Passed
Pull Request — master (#414)
by Osma
Created 01:58

annif.cli.set_project_config_file_path()   A

Complexity

Conditions 3

Size

Total Lines 5
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0

Metric  Value
eloc    4
dl      0
loc     5
rs      10
c       0
b       0
f       0
cc      3
nop     3
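
For context, the measured function set_project_config_file_path() is the eager Click callback that common_options() attaches to the -p/--projects option in the source listing below: it runs before any command body and stores the given path in the Flask app config. The following is a minimal, self-contained sketch of that option-callback pattern; the names (set_config_path, CONFIG, list_projects) are illustrative only, and a plain dict stands in for the Flask app config.

# Sketch of an eager Click option whose callback records the value in shared
# configuration before the command body runs (illustrative names, not annif's).
import click

CONFIG = {}


def set_config_path(ctx, param, value):
    """Eager callback: store the -p/--projects value before the command runs"""
    if value:
        CONFIG['PROJECTS_FILE'] = value


@click.command()
@click.option('-p', '--projects', callback=set_config_path,
              expose_value=False, is_eager=True,
              help='Set path to projects.cfg')
def list_projects():
    # expose_value=False means the option is not passed to the command as an
    # argument; the command only sees the side effect left in CONFIG.
    click.echo(CONFIG.get('PROJECTS_FILE', '<default>'))


if __name__ == '__main__':
    list_projects()

Invoked as, say, python sketch.py -p projects.cfg, the callback fires first and the command body then reads the stored path; the real callback does the same but writes into current_app.config inside a Flask application context.
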
"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import os.path
import re
import sys
import click
import click_log
from flask import current_app
from flask.cli import FlaskGroup, ScriptInfo
import annif
import annif.corpus
import annif.eval
import annif.parallel
import annif.project
import annif.registry
from annif.project import Access
from annif.suggestion import SuggestionFilter
from annif.exception import ConfigurationException, NotSupportedException

logger = annif.logger
click_log.basic_config(logger)

cli = FlaskGroup(create_app=annif.create_app)


def get_project(project_id):
    """
    Helper function to get a project by ID and bail out if it doesn't exist"""
    try:
        return annif.registry.get_project(project_id, min_access=Access.hidden)
    except ValueError:
        click.echo(
            "No projects found with id \'{0}\'.".format(project_id),
            err=True)
        sys.exit(1)


def open_documents(paths):
    """Helper function to open a document corpus from a list of pathnames,
    each of which is either a TSV file or a directory of TXT files. The
    corpus will be returned as an instance of DocumentCorpus."""

    def open_doc_path(path):
        """open a single path and return it as a DocumentCorpus"""
        if os.path.isdir(path):
            return annif.corpus.DocumentDirectory(path, require_subjects=True)
        return annif.corpus.DocumentFile(path)

    if len(paths) == 0:
        logger.warning('Reading empty file')
        docs = open_doc_path(os.path.devnull)
    elif len(paths) == 1:
        docs = open_doc_path(paths[0])
    else:
        corpora = [open_doc_path(path) for path in paths]
        docs = annif.corpus.CombinedCorpus(corpora)
    return docs


def parse_backend_params(backend_param, project):
    """Parse a list of backend parameters given with the --backend-param
    option into a nested dict structure"""
    backend_params = collections.defaultdict(dict)
    for beparam in backend_param:
        backend, param = beparam.split('.', 1)
        key, val = param.split('=', 1)
        validate_backend_params(backend, beparam, project)
        backend_params[backend][key] = val
    return backend_params


def validate_backend_params(backend, beparam, project):
    if 'algorithm' in beparam:
        raise NotSupportedException('Algorithm overriding not supported.')
    if backend != project.config['backend']:
        raise ConfigurationException(
            'The backend {} in CLI option "-b {}" not matching the project'
            ' backend {}.'
            .format(backend, beparam, project.config['backend']))


def generate_filter_batches(subjects):
    filter_batches = collections.OrderedDict()
    for limit in range(1, 16):
        for threshold in [i * 0.05 for i in range(20)]:
            hit_filter = SuggestionFilter(subjects, limit, threshold)
            batch = annif.eval.EvaluationBatch(subjects)
            filter_batches[(limit, threshold)] = (hit_filter, batch)
    return filter_batches


def set_project_config_file_path(ctx, param, value):
    """Override the default path or the path given in env by CLI option"""
    with ctx.ensure_object(ScriptInfo).load_app().app_context():
        if value:
            current_app.config['PROJECTS_FILE'] = value


def common_options(f):
    """Decorator to add common options for all CLI commands"""
    f = click.option(
        '-p', '--projects', help='Set path to projects.cfg',
        type=click.Path(dir_okay=False, exists=True),
        callback=set_project_config_file_path, expose_value=False,
        is_eager=True)(f)
    return click_log.simple_verbosity_option(logger)(f)


def backend_param_option(f):
    """Decorator to add an option for CLI commands to override BE parameters"""
    return click.option(
        '--backend-param', '-b', multiple=True,
        help='Override backend parameter of the config file. ' +
        'Syntax: "-b <backend>.<parameter>=<value>".')(f)


@cli.command('list-projects')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_projects():
    """
    List available projects.
    """

    template = "{0: <25}{1: <45}{2: <10}{3: <7}"
    header = template.format(
        "Project ID", "Project Name", "Language", "Trained")
    click.echo(header)
    click.echo("-" * len(header))
    for proj in annif.registry.get_projects(
            min_access=Access.private).values():
        click.echo(template.format(
            proj.project_id, proj.name, proj.language, str(proj.is_trained)))


@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = get_project(project_id)
    click.echo(f'Project ID:        {proj.project_id}')
    click.echo(f'Project Name:      {proj.name}')
    click.echo(f'Language:          {proj.language}')
    click.echo(f'Access:            {proj.access.name}')
    click.echo(f'Trained:           {proj.is_trained}')
    click.echo(f'Modification time: {proj.modification_time}')


@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    proj = get_project(project_id)
    proj.remove_model_data()


@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    proj = get_project(project_id)
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile, proj.language)
    else:
        # probably a TSV file
        subjects = annif.corpus.SubjectFileTSV(subjectfile)
    proj.vocab.load_vocabulary(subjects, proj.language)


@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--cached/--no-cached', default=False,
              help='Reuse preprocessed training data from previous run')
@backend_param_option
@common_options
def run_train(project_id, paths, cached, backend_param):
    """
    Train a project on a collection of documents.
    """
    proj = get_project(project_id)
    backend_params = parse_backend_params(backend_param, proj)
    if cached:
        if len(paths) > 0:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option.")
        documents = 'cached'
    else:
        documents = open_documents(paths)
    proj.train(documents, backend_params)


@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@backend_param_option
@common_options
def run_learn(project_id, paths, backend_param):
    """
    Further train an existing project on a collection of documents.
    """
    proj = get_project(project_id)
    backend_params = parse_backend_params(backend_param, proj)
    documents = open_documents(paths)
    proj.learn(documents, backend_params)


@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    text = sys.stdin.read()
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)
    hits = hit_filter(project.suggest(text, backend_params))
    for hit in hits.as_list(project.subjects):
        click.echo(
            "<{}>\t{}\t{}".format(
                hit.uri,
                '\t'.join(filter(None, (hit.label, hit.notation))),
                hit.score))


@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
    '--suffix',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()
        subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            results = project.suggest(text, backend_params)
            for hit in hit_filter(results).as_list(project.subjects):
                line = "<{}>\t{}\t{}".format(
                    hit.uri,
                    '\t'.join(filter(None, (hit.label, hit.notation))),
                    hit.score)
                click.echo(line, file=subjfile)


@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option(
    '--results-file',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""")
@click.option('--jobs',
              default=1,
              help='Number of parallel jobs (0 means all CPUs)')
@backend_param_option
@common_options
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        results_file,
        jobs,
        backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            print('', end='', file=results_file)
            click.echo('Writing per subject evaluation results to {!s}'.format(
                results_file.name))
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e))
    docs = open_documents(paths)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize()
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    with pool_class(jobs) as pool:
        for hits, uris, labels in pool.imap_unordered(
                psmap.suggest, docs.documents):
            eval_batch.evaluate(hits[project_id],
                                annif.corpus.SubjectSet((uris, labels)))

    template = "{0:<30}\t{1}"
    for metric, score in eval_batch.results(results_file=results_file).items():
        click.echo(template.format(metric + ":", score))


@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@backend_param_option
@common_options
def run_optimize(project_id, paths, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths)
    for doc in docs.documents:
        hits = project.suggest(doc.text, backend_params)
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a list that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = list(filter_batches.items())
    while filter_batches:
        params, filter_batch = filter_batches.pop(0)
        metrics = ['Precision (doc avg)',
                   'Recall (doc avg)',
                   'F1 score (doc avg)',
                   'NDCG@5',
                   'NDCG@10']
        results = filter_batch[1].results(metrics=metrics)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))


@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--trials', default=10, help='Number of trials')
@click.option('--jobs',
              default=1,
              help='Number of parallel runs (-1 means all CPUs)')
@click.option('--metric', default='NDCG', help='Metric to optimize')
@click.option(
    '--results-file',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""")
@common_options
def run_hyperopt(project_id, paths, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using a validation corpus.
    """
    proj = get_project(project_id)
    documents = open_documents(paths)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo("---")
    for line in rec.lines:
        click.echo(line)
    click.echo("---")


if __name__ == '__main__':
    cli()