Passed
Pull Request — master (#630)
by Osma
02:49
created

annif.cli.run_index()   C

Complexity

Conditions 8

Size

Total Lines 51
Code Lines 45

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 45
nop 8
dl 0
loc 51
rs 6.9333
c 0
b 0
f 0

How to fix: Long Method, Many Parameters

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

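Commonly applied refactorings include Extract Method and, when many parameters or temporary variables get in the way, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.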

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists: Replace Parameter with Method Call, Preserve Whole Object, and Introduce Parameter Object.

1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import os.path
7
import re
8
import sys
9
import json
10
import click
11
import click_log
12
from flask import current_app
13
from flask.cli import FlaskGroup, ScriptInfo
14
import annif
15
import annif.corpus
16
import annif.parallel
17
import annif.project
18
import annif.registry
19
from annif.project import Access
20
from annif.suggestion import SuggestionFilter, ListSuggestionResult
21
from annif.exception import ConfigurationException, NotSupportedException
22
from annif.exception import NotInitializedException
23
from annif.util import metric_code
24
25
# Module-wide logger: Annif's own logger, set up with click_log's basic
# configuration.
logger = annif.logger
click_log.basic_config(logger)

# Top-level command group. The built-in version option of FlaskGroup is
# switched off and replaced with one whose message is just the version string.
cli = click.version_option(message='%(version)s')(
    FlaskGroup(create_app=annif.create_app, add_version_option=False))
30
31
32
def get_project(project_id):
    """
    Helper function to look up a project by ID. If the registry raises
    ValueError for the ID, print an error message to stderr and exit with
    status 1."""
    try:
        project = annif.registry.get_project(
            project_id, min_access=Access.private)
    except ValueError:
        click.echo(f"No projects found with id '{project_id}'.", err=True)
        sys.exit(1)
    return project
43
44
45
def get_vocab(vocab_id):
    """
    Helper function to look up a vocabulary by ID. If the registry raises
    ValueError for the ID, print an error message to stderr and exit with
    status 1."""
    try:
        vocab = annif.registry.get_vocab(
            vocab_id, min_access=Access.private)
    except ValueError:
        message = f"No vocabularies found with the id '{vocab_id}'."
        click.echo(message, err=True)
        sys.exit(1)
    return vocab
57
58
59
def open_documents(paths, subject_index, vocab_lang, docs_limit):
    """Helper function that opens a document corpus from a list of pathnames.

    Each path is either a directory (opened as a DocumentDirectory that
    requires subjects, using the given vocabulary language) or a file
    (opened as a DocumentFile). No paths at all yields a corpus read from
    the null device, one path yields that corpus directly, and several
    paths are wrapped in a CombinedCorpus. When docs_limit is not None the
    result is additionally wrapped in a LimitingDocumentCorpus."""

    def _open_single(path):
        """open one path as a corpus object"""
        if not os.path.isdir(path):
            return annif.corpus.DocumentFile(path, subject_index)
        return annif.corpus.DocumentDirectory(
            path, subject_index, vocab_lang, require_subjects=True)

    corpora = [_open_single(path) for path in paths]
    if not corpora:
        logger.warning('Reading empty file')
        docs = _open_single(os.path.devnull)
    elif len(corpora) == 1:
        docs = corpora[0]
    else:
        docs = annif.corpus.CombinedCorpus(corpora)

    if docs_limit is None:
        return docs
    return annif.corpus.LimitingDocumentCorpus(docs, docs_limit)
85
86
87
def parse_backend_params(backend_param, project):
    """Parse a list of backend parameters given with the --backend-param
    option into a nested dict structure.

    Each item must have the form ``<backend>.<parameter>=<value>``. The
    result maps each backend name to a dict of its parameter names and
    (string) values.

    Raises ConfigurationException when an item does not follow that syntax,
    or (via validate_backend_params) when the named backend differs from the
    backend of the given project."""
    backend_params = collections.defaultdict(dict)
    for beparam in backend_param:
        try:
            backend, param = beparam.split('.', 1)
            key, val = param.split('=', 1)
        except ValueError:
            # A missing '.' or '=' would otherwise surface as an opaque
            # tuple-unpacking error; report the offending option instead.
            raise ConfigurationException(
                'Invalid backend parameter "{}": expected syntax '
                '<backend>.<parameter>=<value>'.format(beparam)) from None
        validate_backend_params(backend, beparam, project)
        backend_params[backend][key] = val
    return backend_params
97
98
99
def validate_backend_params(backend, beparam, project):
    """Check that the backend named in a ``-b`` option is the backend set in
    the project configuration; raise ConfigurationException if it is not."""
    project_backend = project.config['backend']
    if backend == project_backend:
        return
    raise ConfigurationException(
        'The backend {} in CLI option "-b {}" not matching the project'
        ' backend {}.'
        .format(backend, beparam, project_backend))
105
106
107
# Largest "limit" value tried by generate_filter_batches (limits run from 1
# up to and including this value); run_optimize also uses it as the limit
# when pre-filtering raw suggestions.
BATCH_MAX_LIMIT = 15
108
109
110
def generate_filter_batches(subjects):
    """Build an ordered mapping from (limit, threshold) pairs to
    (SuggestionFilter, EvaluationBatch) pairs.

    Limits range from 1 to BATCH_MAX_LIMIT and thresholds are the twenty
    multiples of 0.05 starting at zero; every combination gets its own
    filter and its own evaluation batch."""
    import annif.eval

    limits = range(1, BATCH_MAX_LIMIT + 1)
    thresholds = [step * 0.05 for step in range(20)]

    filter_batches = collections.OrderedDict()
    for limit in limits:
        for threshold in thresholds:
            filter_batches[limit, threshold] = (
                SuggestionFilter(subjects, limit, threshold),
                annif.eval.EvaluationBatch(subjects))
    return filter_batches
119
120
121
def set_project_config_file_path(ctx, param, value):
    """Click option callback: when a value is given, store it as the
    PROJECTS_CONFIG_PATH setting of the application, overriding the default
    path or the path given in env"""
    script_info = ctx.ensure_object(ScriptInfo)
    app = script_info.load_app()
    # The application is loaded and its context entered even when no value
    # was given.
    with app.app_context():
        if not value:
            return
        current_app.config['PROJECTS_CONFIG_PATH'] = value
126
127
128
def common_options(f):
    """Decorator that attaches the options shared by all CLI commands: the
    ``-p/--projects`` configuration path option and the verbosity option"""
    projects_option = click.option(
        '-p', '--projects',
        help='Set path to project configuration file or directory',
        type=click.Path(dir_okay=True, exists=True),
        callback=set_project_config_file_path,
        expose_value=False,
        is_eager=True)
    verbosity_option = click_log.simple_verbosity_option(logger)
    return verbosity_option(projects_option(f))
137
138
139
def backend_param_option(f):
    """Decorator that attaches the repeatable ``-b/--backend-param`` option,
    used by CLI commands to override backend parameters"""
    add_option = click.option(
        '--backend-param', '-b', multiple=True,
        help='Override backend parameter of the config file. '
             'Syntax: `-b <backend>.<parameter>=<value>`.')
    return add_option(f)
145
146
147
@cli.command('list-projects')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    # Fixed-width, left-aligned columns shared by the header and the rows
    row_format = "{0: <25}{1: <45}{2: <10}{3: <7}"
    header = row_format.format(
        "Project ID", "Project Name", "Language", "Trained")
    click.echo(header)
    click.echo("-" * len(header))

    projects = annif.registry.get_projects(min_access=Access.private)
    for proj in projects.values():
        row = row_format.format(
            proj.project_id, proj.name, proj.language, str(proj.is_trained))
        click.echo(row)
170
171
172
@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    def show(label, value):
        # Labels are padded to 19 characters so that all values line up
        click.echo(f'{label: <19}{value}')

    proj = get_project(project_id)
    show('Project ID:', proj.project_id)
    show('Project Name:', proj.name)
    show('Language:', proj.language)
    show('Vocabulary:', proj.vocab.vocab_id)
    show('Vocab language:', proj.vocab_lang)
    show('Access:', proj.access.name)
    show('Trained:', proj.is_trained)
    show('Modification time:', proj.modification_time)
189
190
191
@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Look up the project (exits if unknown) and remove its model data
    get_project(project_id).remove_model_data()
200
201
202
@cli.command('list-vocabs')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_vocabs():
    """
    List available vocabularies.
    """

    row_format = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    header = row_format.format(
        "Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(header)
    click.echo("-" * len(header))

    vocabs = annif.registry.get_vocabs(min_access=Access.private)
    for vocab in vocabs.values():
        try:
            languages = ','.join(sorted(vocab.languages))
            size = len(vocab)
        except NotInitializedException:
            # Vocabulary not usable yet: show placeholders instead
            languages = size = '-'
            loaded = False
        else:
            loaded = True
        click.echo(row_format.format(
            vocab.vocab_id, languages, size, str(loaded)))
227
228
229
@cli.command('load-vocab')
@click.argument('vocab_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@click.option('--language', '-L', help='Language of subject file')
@click.option('--force', '-f', default=False, is_flag=True,
              help='Replace existing vocabulary completely ' +
                   'instead of updating it')
@common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = get_vocab(vocab_id)
    corpus = annif.corpus

    if corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # CSV file
        subjects = corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    elif language:
        # probably a TSV file, whose language was given on the command line
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = corpus.SubjectFileTSV(subjectfile, language)
    else:
        # probably a TSV file - but we need to know its language
        click.echo("Please use --language option to set the language of "
                   "a TSV vocabulary.", err=True)
        sys.exit(1)

    vocab.load_vocabulary(subjects, force=force)
259
260
261
@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--cached/--no-cached', '-c/-C', default=False,
              help='Reuse preprocessed training data from previous run')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--jobs',
              '-j',
              default=0,
              help='Number of parallel jobs (0 means choose automatically)')
@backend_param_option
@common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    proj = get_project(project_id)
    backend_params = parse_backend_params(backend_param, proj)

    if not cached:
        documents = open_documents(
            paths, proj.subjects, proj.vocab_lang, docs_limit)
    elif paths:
        # --cached and explicit corpus paths are mutually exclusive
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option.")
    else:
        # the string 'cached' is passed to train() in place of a corpus
        documents = 'cached'

    proj.train(documents, backend_params, jobs)
297
298
299
@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    proj = get_project(project_id)
    overrides = parse_backend_params(backend_param, proj)
    corpus = open_documents(
        paths, proj.subjects, proj.vocab_lang, docs_limit)
    proj.learn(corpus, overrides)
320
321
322
@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@click.option('--language', '-L', help='Language of subject labels')
@backend_param_option
@common_options
def run_suggest(project_id, limit, threshold, language, backend_param):
    """
    Suggest subjects for a single document from standard input.
    \f
    This will read a text document from standard input and suggest subjects for
    it.
    """
    project = get_project(project_id)
    text = sys.stdin.read()

    # Fall back to the project's vocabulary language for the labels
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(
            f'language "{lang}" not supported by vocabulary')

    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)
    suggestions = project.suggest(text, backend_params)

    for hit in hit_filter(suggestions).as_list():
        subj = project.subjects[hit.subject_id]
        # Label and notation share the middle column; empty ones are dropped
        label_fields = '\t'.join(
            field for field in (subj.labels[lang], subj.notation) if field)
        click.echo(f"<{subj.uri}>\t{label_fields}\t{hit.score}")
354
355
356
def _result_filename(docfilename, suffix):
    """Return the result file name: docfilename with a trailing ``.txt``
    replaced by suffix (inserted literally, not as a regex template)."""
    # A function replacement makes re.sub use the suffix verbatim, so
    # backslashes or group references in user input are not interpreted.
    return re.sub(r'\.txt$', lambda _match: suffix, docfilename)


def _format_hit(subj, lang, score):
    """Return one TSV result line: <URI>, label and notation (when present),
    score."""
    label_fields = '\t'.join(filter(None, (subj.labels[lang], subj.notation)))
    return "<{}>\t{}\t{}".format(subj.uri, label_fields, score)


@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
    '--suffix',
    '-s',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', '-f/-F', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@click.option('--language', '-L', help='Language of subject labels')
@backend_param_option
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, language, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(
            f'language "{lang}" not supported by vocabulary')
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    documents = annif.corpus.DocumentDirectory(
        directory, project.subjects, project.vocab_lang,
        require_subjects=False)
    for docfilename, _subjectfn in documents:
        subjectfilename = _result_filename(docfilename, suffix)
        # Decide whether to skip before reading the document, so skipped
        # documents cost no I/O.
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue

        # utf-8-sig tolerates a leading byte order mark in the document
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()

        # Compute the suggestions before opening the result file for writing,
        # so a failing suggest() cannot leave an empty or truncated result.
        hits = hit_filter(project.suggest(text, backend_params)).as_list()
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hits:
                subj = project.subjects[hit.subject_id]
                click.echo(_format_hit(subj, lang, hit.score), file=subjfile)
407
408
409
@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--metric', '-m', default=[], multiple=True,
              help='Metric to calculate (default: all)')
@click.option(
    '--metrics-file',
    '-M',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""")
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""")
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel jobs (0 means all CPUs)')
@backend_param_option
@common_options
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        docs_limit,
        metric,
        metrics_file,
        results_file,
        jobs,
        backend_param):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    import annif.eval
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # Writing nothing forces the lazily opened file to be opened
            # now, so an unwritable path is reported before any evaluation.
            print('', end='', file=results_file)
            click.echo(
                f'Writing per subject evaluation results to '
                f'{results_file.name!s}')
        except Exception as e:
            raise NotSupportedException(
                f"cannot open results-file for writing: {e}")

    docs = open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    with pool_class(jobs) as pool:
        suggested = pool.imap_unordered(psmap.suggest, docs.documents)
        for hits, subject_set in suggested:
            eval_batch.evaluate(hits[project_id], subject_set)

    metrics = eval_batch.results(metrics=metric,
                                 results_file=results_file,
                                 language=project.vocab_lang)

    row_format = "{0:<30}\t{1}"
    for metric_name, score in metrics.items():
        click.echo(row_format.format(metric_name + ":", score))

    if metrics_file:
        coded_metrics = {
            metric_code(metric_name): value
            for metric_name, value in metrics.items()}
        json.dump(coded_metrics, metrics_file, indent=2)
509
510
511
@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, project.subjects,
                          project.vocab_lang, docs_limit)
    for doc in docs.documents:
        raw_hits = project.suggest(doc.text, backend_params)
        # Pre-filter once with the largest limit; each per-combination
        # filter then works on this reduced result.
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), \
            "Optimize should only be done with ListSuggestionResult " + \
            "as it would be very slow with VectorSuggestionResult."
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), doc.subject_set)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    # The metric names do not change between batches, so define them once
    # here; they are also needed by the summary after the loop.
    metrics = ['Precision (doc avg)',
               'Recall (doc avg)',
               'F1 score (doc avg)']
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a queue that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        params, (_hit_filter, batch) = filter_batches.popleft()
        results = batch.results(metrics=metrics)
        for metric, score in results.items():
            # ">=" means that on ties the later combination wins
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))
585
586
587
@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--trials', '-T', default=10, help='Number of trials')
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel runs (0 means all CPUs)')
@click.option('--metric', '-m', default='NDCG',
              help='Metric to optimize (default: NDCG)')
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""")
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric,
                 results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    proj = get_project(project_id)
    documents = open_documents(
        paths, proj.subjects, proj.vocab_lang, docs_limit)

    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    rec = proj.hyperopt(documents, trials, jobs, metric, results_file)

    # Report the best score, then the recommendation lines between rulers
    ruler = "---"
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo(ruler)
    for line in rec.lines:
        click.echo(line)
    click.echo(ruler)
628
629
630
# Run the command group when this module is executed as a script
if __name__ == '__main__':
    cli()
632