Passed
Push to master (93695b...d2dff1) by Osma
04:07 (queued 11s)

annif.cli.run_clear_project()   A

Complexity:   Conditions 1
Size:         Total Lines 9, Code Lines 6
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric   Value
eloc     6
dl       0
loc      9
rs       10
c        0
b        0
f        0
cc       1
nop      1
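The size and complexity figures above can be approximated locally if needed. The sketch below uses the radon library for this; it is only an illustration, under the assumption that radon's cyclomatic complexity and raw line counts correspond roughly to the cc/loc/eloc columns of this report (the report's exact metric definitions may differ).

# Rough local approximation of the report's metrics using the radon library.
# Assumption: radon's definitions of cyclomatic complexity and line counts
# only approximate the cc/loc/eloc figures shown by the reporting tool.
from radon.complexity import cc_visit
from radon.raw import analyze

with open("annif/cli.py", encoding="utf-8") as src:
    code = src.read()

raw = analyze(code)
print("file totals: loc={} lloc={} blank={}".format(
    raw.loc, raw.lloc, raw.blank))

for block in cc_visit(code):
    if block.name == "run_clear_project":
        # cc 1 in the report means a single linear path through the function
        print(block.name, "cyclomatic complexity:", block.complexity)

The analyzed source, annif/cli.py: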
"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import os.path
import re
import sys
import click
import click_log
from flask import current_app
from flask.cli import FlaskGroup, ScriptInfo
import annif
import annif.corpus
import annif.eval
import annif.project
from annif.project import Access
from annif.suggestion import SuggestionFilter

logger = annif.logger
click_log.basic_config(logger)

cli = FlaskGroup(create_app=annif.create_app)


def get_project(project_id):
    """
    Helper function to get a project by ID and bail out if it doesn't exist"""
    try:
        return annif.project.get_project(project_id, min_access=Access.hidden)
    except ValueError:
        click.echo(
            "No projects found with id '{0}'.".format(project_id),
            err=True)
        sys.exit(1)


def open_documents(paths):
    """Helper function to open a document corpus from a list of pathnames,
    each of which is either a TSV file or a directory of TXT files. The
    corpus will be returned as an instance of DocumentCorpus."""

    def open_doc_path(path):
        """open a single path and return it as a DocumentCorpus"""
        if os.path.isdir(path):
            return annif.corpus.DocumentDirectory(path, require_subjects=True)
        return annif.corpus.DocumentFile(path)

    if len(paths) > 1:
        corpora = [open_doc_path(path) for path in paths]
        docs = annif.corpus.CombinedCorpus(corpora)
    else:
        docs = open_doc_path(paths[0])
    return docs


def parse_backend_params(backend_param):
    """Parse a list of backend parameters given with the --backend-param
    option into a nested dict structure"""
    backend_params = collections.defaultdict(dict)
    for beparam in backend_param:
        backend, param = beparam.split('.', 1)
        key, val = param.split('=', 1)
        backend_params[backend][key] = val
    return backend_params


def generate_filter_batches(subjects):
    """Helper function to build a dict mapping (limit, threshold) pairs to
    (SuggestionFilter, EvaluationBatch) tuples, covering limits 1-15 and
    thresholds 0.00-0.95, for use by the optimize command"""
    filter_batches = collections.OrderedDict()
    for limit in range(1, 16):
        for threshold in [i * 0.05 for i in range(20)]:
            hit_filter = SuggestionFilter(limit, threshold)
            batch = annif.eval.EvaluationBatch(subjects)
            filter_batches[(limit, threshold)] = (hit_filter, batch)
    return filter_batches


def set_project_config_file_path(ctx, param, value):
    """Override the default path or the path given in env by CLI option"""
    with ctx.ensure_object(ScriptInfo).load_app().app_context():
        if value:
            current_app.config['PROJECTS_FILE'] = value


def common_options(f):
    """Decorator to add common options for all CLI commands"""
    f = click.option(
        '-p', '--projects', help='Set path to projects.cfg',
        callback=set_project_config_file_path, expose_value=False,
        is_eager=True)(f)
    f = click_log.simple_verbosity_option(logger)(f)
    return f


@cli.command('list-projects')
@common_options
def run_list_projects():
    """
    List available projects.
    """

    template = "{0: <25}{1: <45}{2: <8}"
    header = template.format("Project ID", "Project Name", "Language")
    click.echo(header)
    click.echo("-" * len(header))
    for proj in annif.project.get_projects(min_access=Access.private).values():
        click.echo(template.format(proj.project_id, proj.name, proj.language))


@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = get_project(project_id)
    template = "{0:<20}{1}"
    click.echo(template.format('Project ID:', proj.project_id))
    click.echo(template.format('Project Name:', proj.name))
    click.echo(template.format('Language:', proj.language))
    click.echo(template.format('Access:', proj.access.name))


@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    proj = get_project(project_id)
    proj.remove_model_data()


@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    proj = get_project(project_id)
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile, proj.language)
    else:
        # probably a TSV file
        subjects = annif.corpus.SubjectFileTSV(subjectfile)
    proj.vocab.load_vocabulary(subjects)


@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@common_options
def run_train(project_id, paths):
    """
    Train a project on a collection of documents.
    """
    proj = get_project(project_id)
    documents = open_documents(paths)
    proj.train(documents)


@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@common_options
def run_learn(project_id, paths):
    """
    Further train an existing project on a collection of documents.
    """
    proj = get_project(project_id)
    documents = open_documents(paths)
    proj.learn(documents)


@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    text = sys.stdin.read()
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit, threshold)
    hits = hit_filter(project.suggest(text, backend_params))
    for hit in hits:
        click.echo("<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score))


@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(file_okay=False))
@click.option(
    '--suffix',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        with open(docfilename, encoding='utf-8') as docfile:
            text = docfile.read()
        subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            results = project.suggest(text, backend_params)
            for hit in hit_filter(results):
                line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)
                click.echo(line, file=subjfile)


@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
@common_options
def run_eval(project_id, paths, limit, threshold, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)

    hit_filter = SuggestionFilter(limit=limit, threshold=threshold)
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    docs = open_documents(paths)
    for doc in docs.documents:
        results = project.suggest(doc.text, backend_params)
        hits = hit_filter(results)
        eval_batch.evaluate(hits,
                            annif.corpus.SubjectSet((doc.uris, doc.labels)))

    template = "{0:<20}\t{1}"
    for metric, score in eval_batch.results().items():
        click.echo(template.format(metric + ":", score))


@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
@common_options
def run_optimize(project_id, paths, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths)
    for doc in docs.documents:
        hits = project.suggest(doc.text, backend_params)
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a list that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = list(filter_batches.items())
    while filter_batches:
        params, filter_batch = filter_batches.pop(0)
        results = filter_batch[1].results(metrics='simple')
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in ('Precision (doc avg)',
                   'Recall (doc avg)',
                   'F1 score (doc avg)',
                   'NDCG@5',
                   'NDCG@10'):
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))


if __name__ == '__main__':
    cli()
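For reference, the commands defined above can also be exercised programmatically through Click's test runner, which is how CLI behaviour like this is typically unit-tested. The sketch below assumes an installed Annif with a projects.cfg that defines a project whose ID is "dummy-en" (a hypothetical ID used only for illustration).

# Minimal sketch: invoking the CLI commands above via Click's test runner.
# The project ID "dummy-en" is hypothetical; substitute a real project ID
# from your projects.cfg.
from click.testing import CliRunner

import annif.cli

runner = CliRunner()

# Equivalent of running `annif list-projects` in a shell
result = runner.invoke(annif.cli.cli, ["list-projects"])
print(result.output)

# Equivalent of `echo "..." | annif suggest dummy-en --limit 5`
result = runner.invoke(
    annif.cli.cli,
    ["suggest", "dummy-en", "--limit", "5"],
    input="A short document about library classification.")
print(result.output)

Using CliRunner keeps the invocation in-process, so output and exit codes can be inspected directly instead of spawning a subprocess.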