Passed
Pull Request — master (#557)
by Osma
06:27
created

annif.cli.run_list_projects()   A

Complexity

Conditions 2

Size

Total Lines 17
Code Lines 13

Duplication

Lines 17
Ratio 100 %

Importance

Changes 0
Metric Value
cc 2
eloc 13
nop 0
dl 17
loc 17
rs 9.75
c 0
b 0
f 0
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import os.path
7
import re
8
import sys
9
import json
10
import click
11
import click_log
12
from flask import current_app
13
from flask.cli import FlaskGroup, ScriptInfo
14
import annif
15
import annif.corpus
16
import annif.parallel
17
import annif.project
18
import annif.registry
19
from annif.project import Access
20
from annif.suggestion import SuggestionFilter, ListSuggestionResult
21
from annif.exception import ConfigurationException, NotSupportedException
22
23
# Reuse the package-level Annif logger and let click_log configure it.
logger = annif.logger
click_log.basic_config(logger)

# Root command group. annif.create_app is handed to FlaskGroup as the app
# factory; FlaskGroup's own version option is switched off and replaced by
# a Click version option whose message is just the version string.
cli = FlaskGroup(
    create_app=annif.create_app,
    add_version_option=False,
)
cli = click.version_option(message='%(version)s')(cli)
28
29
30
def get_project(project_id):
    """Return the project registered under ``project_id`` or exit.

    The project is fetched via ``annif.registry.get_project`` using
    ``min_access=Access.private``. If the registry raises ``ValueError``,
    an error message is written to stderr and the process terminates with
    exit status 1.
    """
    try:
        project = annif.registry.get_project(
            project_id, min_access=Access.private)
    except ValueError:
        message = "No projects found with id '{0}'.".format(project_id)
        click.echo(message, err=True)
        sys.exit(1)
    else:
        return project
41
42
43 View Code Duplication
def open_documents(paths, docs_limit):
    """Build one document corpus out of zero or more pathnames.

    A directory is opened as ``annif.corpus.DocumentDirectory`` (with
    ``require_subjects=True``); any other path is opened as
    ``annif.corpus.DocumentFile``. Several paths are merged into a
    ``CombinedCorpus``. With no paths at all a warning is logged and
    ``os.path.devnull`` is opened instead. When ``docs_limit`` is not
    ``None`` the result is wrapped in a ``LimitingDocumentCorpus``.
    """

    def load_corpus(location):
        """Wrap a single pathname in the matching corpus class."""
        if not os.path.isdir(location):
            return annif.corpus.DocumentFile(location)
        return annif.corpus.DocumentDirectory(location, require_subjects=True)

    path_count = len(paths)
    if path_count > 1:
        corpus = annif.corpus.CombinedCorpus(
            [load_corpus(location) for location in paths])
    elif path_count == 1:
        corpus = load_corpus(paths[0])
    else:
        logger.warning('Reading empty file')
        corpus = load_corpus(os.path.devnull)

    if docs_limit is None:
        return corpus
    return annif.corpus.LimitingDocumentCorpus(corpus, docs_limit)
66
67
68 View Code Duplication
def parse_backend_params(backend_param, project):
    """Parse ``--backend-param`` values into a nested dict.

    Each item must have the form ``<backend>.<parameter>=<value>``. The
    string is split at the first ``.`` and the remainder at the first
    ``=``, so values may themselves contain ``.`` or ``=``.

    Parameters:
        backend_param: iterable of raw option strings.
        project: project whose configured backend each item is validated
            against (see ``validate_backend_params``).

    Returns:
        A ``collections.defaultdict(dict)`` mapping backend id to a dict of
        ``{parameter: value}``; all values are kept as strings.

    Raises:
        ConfigurationException: if an item does not follow the expected
            syntax, or names a backend other than the project's.
    """
    backend_params = collections.defaultdict(dict)
    for beparam in backend_param:
        backend, dot, assignment = beparam.partition('.')
        key, equals, val = assignment.partition('=')
        # partition() returns an empty separator when it was not found;
        # report that explicitly instead of failing on tuple unpacking.
        if not dot or not equals:
            raise ConfigurationException(
                'Malformed CLI option "-b {}": expected syntax '
                '"<backend>.<parameter>=<value>".'.format(beparam))
        validate_backend_params(backend, beparam, project)
        backend_params[backend][key] = val
    return backend_params
78
79
80
def validate_backend_params(backend, beparam, project):
    """Check that a ``-b`` override targets the project's own backend.

    ``backend`` is compared with ``project.config['backend']``; on a
    mismatch a ``ConfigurationException`` is raised whose message quotes
    the full option value ``beparam``. Returns ``None`` otherwise.
    """
    if backend == project.config['backend']:
        return
    raise ConfigurationException(
        'The backend {} in CLI option "-b {}" not matching the project'
        ' backend {}.'.format(backend, beparam, project.config['backend']))
86
87
88
# Highest "limit" value tried by generate_filter_batches(); run_optimize()
# also passes it as the limit when pre-filtering each document's raw hits.
BATCH_MAX_LIMIT = 15
89
90
91
def generate_filter_batches(subjects):
    """Create one (filter, evaluation batch) pair per limit/threshold combo.

    Limits run from 1 to ``BATCH_MAX_LIMIT`` inclusive and thresholds are
    ``i * 0.05`` for ``i`` in ``range(20)``. The result is an
    ``OrderedDict`` keyed by ``(limit, threshold)``, ordered by limit and
    then threshold, whose values are ``(SuggestionFilter, EvaluationBatch)``
    tuples built from ``subjects``.
    """
    import annif.eval

    thresholds = [step * 0.05 for step in range(20)]
    return collections.OrderedDict(
        ((limit, threshold),
         (SuggestionFilter(subjects, limit, threshold),
          annif.eval.EvaluationBatch(subjects)))
        for limit in range(1, BATCH_MAX_LIMIT + 1)
        for threshold in thresholds)
100
101
102
def set_project_config_file_path(ctx, param, value):
    """Click callback for the ``--projects`` option.

    Loads the app through the context's ``ScriptInfo`` and, inside its
    application context, stores a non-empty ``value`` as
    ``current_app.config['PROJECTS_FILE']``. The app is loaded even when
    no value was given.
    """
    script_info = ctx.ensure_object(ScriptInfo)
    app = script_info.load_app()
    with app.app_context():
        if not value:
            return
        current_app.config['PROJECTS_FILE'] = value
107
108
109
def common_options(f):
    """Decorate ``f`` with the options shared by every CLI command.

    Adds the eager ``-p/--projects`` option (handled by
    ``set_project_config_file_path`` and not passed on to ``f``) and then
    the click_log verbosity option bound to the module logger.
    """
    projects_option = click.option(
        '-p', '--projects',
        help='Set path to projects.cfg',
        type=click.Path(dir_okay=False, exists=True),
        callback=set_project_config_file_path,
        expose_value=False,
        is_eager=True)
    verbosity_option = click_log.simple_verbosity_option(logger)
    return verbosity_option(projects_option(f))
117
118
119
def backend_param_option(f):
    """Decorate ``f`` with the repeatable ``--backend-param``/``-b`` option.

    The collected values reach the command as ``backend_param``.
    """
    help_text = (
        'Override backend parameter of the config file. '
        'Syntax: "-b <backend>.<parameter>=<value>".')
    add_option = click.option(
        '--backend-param', '-b', multiple=True, help=help_text)
    return add_option(f)
125
126
127 View Code Duplication
@cli.command('list-projects')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_projects():
    """
    List available projects.
    """
    # Left-aligned fixed-width columns: id (25), name (45), language (10),
    # trained flag (7).
    row_format = "{0: <25}{1: <45}{2: <10}{3: <7}"
    heading = row_format.format(
        "Project ID", "Project Name", "Language", "Trained")
    click.echo(heading)
    click.echo("-" * len(heading))

    projects = annif.registry.get_projects(min_access=Access.private)
    for project in projects.values():
        row = row_format.format(
            project.project_id,
            project.name,
            project.language,
            str(project.is_trained))
        click.echo(row)
144
145
146
@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """
    proj = get_project(project_id)

    # Each value is read only when its line is printed, so attribute access
    # and output stay interleaved. Labels are padded to a 19-character
    # column.
    fields = (
        ('Project ID:', lambda: proj.project_id),
        ('Project Name:', lambda: proj.name),
        ('Language:', lambda: proj.language),
        ('Access:', lambda: proj.access.name),
        ('Trained:', lambda: proj.is_trained),
        ('Modification time:', lambda: proj.modification_time),
    )
    for label, read_value in fields:
        click.echo(f'{label:<19}{read_value()}')
161
162
163
@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # get_project() exits the program itself when the id is unknown.
    get_project(project_id).remove_model_data()
172
173
174 View Code Duplication
@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    project = get_project(project_id)
    looks_like_rdf = annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile)
    if not looks_like_rdf:
        # not recognised as RDF, so it is read as a TSV file
        vocabulary_source = annif.corpus.SubjectFileTSV(subjectfile)
    else:
        # SKOS/RDF input, read in the project's language
        vocabulary_source = annif.corpus.SubjectFileSKOS(
            subjectfile, project.language)
    project.vocab.load_vocabulary(vocabulary_source, project.language)
190
191
192 View Code Duplication
@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--cached/--no-cached', '-c/-C', default=False,
              help='Reuse preprocessed training data from previous run')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--jobs', '-j', default=0,
              help='Number of parallel jobs (0 means choose automatically)')
@backend_param_option
@common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    """
    project = get_project(project_id)
    params = parse_backend_params(backend_param, project)

    if not cached:
        corpus = open_documents(paths, docs_limit)
    elif len(paths) > 0:
        # --cached and explicit corpus paths are mutually exclusive
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option.")
    else:
        # the literal string 'cached' is what gets handed to project.train()
        corpus = 'cached'

    project.train(corpus, params, jobs)
220
221
222 View Code Duplication
@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    """
    project = get_project(project_id)
    # Backend overrides are parsed (and validated) before the corpus is
    # opened.
    params = parse_backend_params(backend_param, project)
    corpus = open_documents(paths, docs_limit)
    project.learn(corpus, params)
238
239
240 View Code Duplication
@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    document_text = sys.stdin.read()
    params = parse_backend_params(backend_param, project)
    suggestion_filter = SuggestionFilter(project.subjects, limit, threshold)
    suggestions = suggestion_filter(project.suggest(document_text, params))

    # One line per hit: <uri>, label and notation (falsy ones skipped),
    # then score, all tab-separated.
    for hit in suggestions.as_list(project.subjects):
        described = '\t'.join(
            part for part in (hit.label, hit.notation) if part)
        click.echo("<{}>\t{}\t{}".format(hit.uri, described, hit.score))
261
262
263 View Code Duplication
@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
    '--suffix',
    '-s',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', '-f/-F', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        # A callable replacement makes re.sub insert the user-supplied
        # suffix literally; a plain string would have its backslashes
        # interpreted as escapes or group references.
        subjectfilename = re.sub(
            r'\.txt$', lambda _match: suffix, docfilename)

        # Decide whether to skip before doing any reading or suggesting.
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue

        # 'utf-8-sig' drops a leading byte order mark if one is present.
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()

        # Build all output lines before opening the result file, so that a
        # failure while suggesting cannot leave a truncated or empty file.
        results = project.suggest(text, backend_params)
        lines = [
            "<{}>\t{}\t{}".format(
                hit.uri,
                '\t'.join(filter(None, (hit.label, hit.notation))),
                hit.score)
            for hit in hit_filter(results).as_list(project.subjects)]

        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for line in lines:
                click.echo(line, file=subjfile)
305
306
307 View Code Duplication
@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option(
    '--metrics-file',
    '-M',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""")
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""")
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel jobs (0 means all CPUs)')
@backend_param_option
@common_options
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        docs_limit,
        metrics_file,
        results_file,
        jobs,
        backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    import annif.eval
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # The option is declared with lazy=True, so writing an empty string
        # here is what first touches the file; an unwritable path is thus
        # reported before any evaluation work is done.
        try:
            print('', end='', file=results_file)
            click.echo('Writing per subject evaluation results to {!s}'.format(
                results_file.name))
        except Exception as e:
            # Chain the original error so its traceback is not lost.
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)) from e
    docs = open_documents(paths, docs_limit)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    with pool_class(jobs) as pool:
        for hits, uris, labels in pool.imap_unordered(
                psmap.suggest, docs.documents):
            # hits is indexed by project id; only this project was requested
            eval_batch.evaluate(hits[project_id],
                                annif.corpus.SubjectSet((uris, labels)))

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(results_file=results_file)
    for metric, score in metrics.items():
        click.echo(template.format(metric + ":", score))
    if metrics_file:
        # The metrics were already echoed above; only the JSON file is
        # written here.
        json.dump(metrics, metrics_file, indent=2)
394
395
396 View Code Duplication
@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, docs_limit)
    for doc in docs.documents:
        raw_hits = project.suggest(doc.text, backend_params)
        # Pre-filter once with the largest limit that any batch will use.
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), \
            "Optimize should only be done with ListSuggestionResult " + \
            "as it would be very slow with VectorSuggestionResult."
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    # Bound once, before the loop: the summary below reads it too.
    metrics = ('Precision (doc avg)',
               'Recall (doc avg)',
               'F1 score (doc avg)')
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a queue that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        params, filter_batch = filter_batches.popleft()
        # A fresh list is passed on every call, as before.
        results = filter_batch[1].results(metrics=list(metrics))
        for metric, score in results.items():
            # ">=" lets a later combination win ties
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))
468
469
470 View Code Duplication
@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--trials', '-T', default=10, help='Number of trials')
@click.option('--jobs', '-j', default=1,
              help='Number of parallel runs (0 means all CPUs)')
@click.option('--metric', '-m', default='NDCG',
              help='Metric to optimize (default: NDCG)')
@click.option(
    '--results-file', '-r',
    type=click.File('w', encoding='utf-8', errors='ignore', lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""")
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric,
                 results_file):
    """
    Optimize the hyperparameters of a project using a validation corpus.
    """
    project = get_project(project_id)
    corpus = open_documents(paths, docs_limit)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")

    recommendation = project.hyperopt(
        corpus, trials, jobs, metric, results_file)

    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")
    # The recommended settings are printed between two "---" rules.
    separator = "---"
    click.echo(separator)
    for recommended_line in recommendation.lines:
        click.echo(recommended_line)
    click.echo(separator)
508
509
510
# Run the command group when this module is executed as a script.
if __name__ == "__main__":
    cli()
512