|
1
|
|
|
"""Definitions for command-line (Click) commands for invoking Annif |
|
2
|
|
|
operations and printing the results to console.""" |
|
3
|
|
|
|
|
4
|
|
|
|
|
5
|
|
|
import collections |
|
6
|
|
|
import os.path |
|
7
|
|
|
import re |
|
8
|
|
|
import sys |
|
9
|
|
|
import click |
|
10
|
|
|
import click_log |
|
11
|
|
|
from flask import current_app |
|
12
|
|
|
from flask.cli import FlaskGroup, ScriptInfo |
|
13
|
|
|
import annif |
|
14
|
|
|
import annif.corpus |
|
15
|
|
|
import annif.eval |
|
16
|
|
|
import annif.project |
|
17
|
|
|
from annif.project import Access |
|
18
|
|
|
from annif.suggestion import SuggestionFilter |
|
19
|
|
|
|
|
20
|
|
|
# Reuse Annif's package-level logger and let click_log configure it.
logger = annif.logger
click_log.basic_config(logger)

# Top-level Click command group; the Flask application is created by
# annif.create_app.
cli = FlaskGroup(create_app=annif.create_app)
|
24
|
|
|
|
|
25
|
|
|
|
|
26
|
|
|
def get_project(project_id):
    """Look up a project by its ID, exiting the program if it is unknown.

    The lookup is done with ``min_access=Access.hidden``. If it raises
    ValueError, an error message is written to stderr and the process exits
    with status 1.
    """
    try:
        project = annif.project.get_project(project_id,
                                            min_access=Access.hidden)
    except ValueError:
        message = "No projects found with id \'{0}\'.".format(project_id)
        click.echo(message, err=True)
        sys.exit(1)
    else:
        return project
|
36
|
|
|
|
|
37
|
|
|
|
|
38
|
|
|
def open_documents(paths):
    """Open a document corpus from a sequence of pathnames.

    Each path is either a directory of TXT files (opened as a
    DocumentDirectory with ``require_subjects=True``) or a TSV file (opened
    as a DocumentFile). A single path is returned as that corpus directly;
    several paths are wrapped in a CombinedCorpus.

    Raises click.UsageError when no paths are given, instead of failing with
    a bare IndexError on ``paths[0]``.
    """

    def open_doc_path(path):
        """Open a single path and return it as a document corpus."""
        if os.path.isdir(path):
            return annif.corpus.DocumentDirectory(path, require_subjects=True)
        return annif.corpus.DocumentFile(path)

    if not paths:
        # The commands declare their paths argument with nargs=-1, so Click
        # passes an empty tuple when the user supplies no paths at all.
        raise click.UsageError("At least one document path must be given.")

    corpora = [open_doc_path(path) for path in paths]
    if len(corpora) == 1:
        return corpora[0]
    return annif.corpus.CombinedCorpus(corpora)
|
55
|
|
|
|
|
56
|
|
|
|
|
57
|
|
|
def parse_backend_params(backend_param):
    """Parse --backend-param option values into a nested dict structure.

    Each item must have the form ``<backend>.<key>=<value>``. The backend
    name ends at the first ``.`` and the key ends at the first ``=`` after
    it, so the value itself may contain further ``=`` or ``.`` characters.

    Returns a ``collections.defaultdict(dict)`` that maps each backend name
    to a dict of ``{key: value}`` strings.

    Raises ValueError, naming the offending item, when either separator is
    missing.
    """
    backend_params = collections.defaultdict(dict)
    for beparam in backend_param:
        backend, dot, param = beparam.partition('.')
        key, equals, val = param.partition('=')
        if not dot or not equals:
            # Same exception type as the former tuple-unpacking failure, but
            # with a message that tells the user what went wrong.
            raise ValueError(
                "Invalid backend parameter '{}': expected the format "
                "<backend>.<key>=<value>".format(beparam))
        backend_params[backend][key] = val
    return backend_params
|
66
|
|
|
|
|
67
|
|
|
|
|
68
|
|
|
def generate_filter_batches(subjects):
    """Build the grid of filter/batch pairs keyed by (limit, threshold).

    Limits run from 1 to 15 and thresholds are ``i * 0.05`` for i in 0..19.
    Each key maps to a tuple of a SuggestionFilter for that combination and
    a new EvaluationBatch created from `subjects`. The result is an
    OrderedDict, filled limit by limit.
    """
    # The threshold steps are the same for every limit, so compute them once.
    thresholds = [step * 0.05 for step in range(20)]
    batches = collections.OrderedDict()
    for limit in range(1, 16):
        for threshold in thresholds:
            key = (limit, threshold)
            batches[key] = (SuggestionFilter(limit, threshold),
                            annif.eval.EvaluationBatch(subjects))
    return batches
|
76
|
|
|
|
|
77
|
|
|
|
|
78
|
|
|
def set_project_config_file_path(ctx, param, value):
    """Click option callback that stores the projects file path, if given.

    The application is loaded and its context entered even when no value is
    given; only a truthy `value` is written to the PROJECTS_FILE setting.
    """
    app = ctx.ensure_object(ScriptInfo).load_app()
    with app.app_context():
        if not value:
            return
        current_app.config['PROJECTS_FILE'] = value
|
83
|
|
|
|
|
84
|
|
|
|
|
85
|
|
|
def common_options(f):
    """Decorate a command with the options shared by all CLI commands.

    Adds the eager -p/--projects option (handled by
    set_project_config_file_path and not passed on to the command) and the
    click_log verbosity option bound to the module logger.
    """
    with_projects = click.option(
        '-p', '--projects',
        help='Set path to projects.cfg',
        callback=set_project_config_file_path,
        expose_value=False,
        is_eager=True)
    with_verbosity = click_log.simple_verbosity_option(logger)
    # Same application order as before: projects option first, then verbosity
    return with_verbosity(with_projects(f))
|
93
|
|
|
|
|
94
|
|
|
|
|
95
|
|
|
@cli.command('list-projects')
@common_options
def run_list_projects():
    """
    List available projects.
    """
    # Left-aligned fixed-width columns: ID (25), name (45), language (8)
    row = "{0: <25}{1: <45}{2: <8}"
    heading = row.format("Project ID", "Project Name", "Language")
    click.echo(heading)
    click.echo(len(heading) * "-")

    projects = annif.project.get_projects(min_access=Access.private)
    for project in projects.values():
        click.echo(
            row.format(project.project_id, project.name, project.language))
|
108
|
|
|
|
|
109
|
|
|
|
|
110
|
|
|
@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """
    proj = get_project(project_id)
    # One "label value" line per field, with the label padded to 20 columns
    fields = (
        ('Project ID:', proj.project_id),
        ('Project Name:', proj.name),
        ('Language:', proj.language),
        ('Access:', proj.access.name),
    )
    row = "{0:<20}{1}"
    for label, value in fields:
        click.echo(row.format(label, value))
|
124
|
|
|
|
|
125
|
|
|
|
|
126
|
|
|
@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # get_project() exits the program itself when the ID is unknown
    get_project(project_id).remove_model_data()
|
135
|
|
|
|
|
136
|
|
|
|
|
137
|
|
|
@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    proj = get_project(project_id)
    is_rdf = annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile)
    if not is_rdf:
        # anything that is not recognised as RDF is probably a TSV file
        subjects = annif.corpus.SubjectFileTSV(subjectfile)
    else:
        # SKOS/RDF file supported by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile, proj.language)
    proj.vocab.load_vocabulary(subjects)
|
153
|
|
|
|
|
154
|
|
|
|
|
155
|
|
|
@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@common_options
def run_train(project_id, paths):
    """
    Train a project on a collection of documents.
    """
    # Resolve the project first so an unknown ID is reported before any
    # document path is opened.
    project = get_project(project_id)
    corpus = open_documents(paths)
    project.train(corpus)
|
166
|
|
|
|
|
167
|
|
|
|
|
168
|
|
|
@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@common_options
def run_learn(project_id, paths):
    """
    Further train an existing project on a collection of documents.
    """
    # Resolve the project first so an unknown ID is reported before any
    # document path is opened.
    project = get_project(project_id)
    corpus = open_documents(paths)
    project.learn(corpus)
|
179
|
|
|
|
|
180
|
|
|
|
|
181
|
|
|
@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    document = sys.stdin.read()
    overrides = parse_backend_params(backend_param)
    keep_best = SuggestionFilter(limit, threshold)

    suggestions = project.suggest(document, overrides)
    # One tab-separated line per hit: <uri>, label, score
    line_format = "<{}>\t{}\t{}"
    for hit in keep_best(suggestions):
        click.echo(line_format.format(hit.uri, hit.label, hit.score))
|
199
|
|
|
|
|
200
|
|
|
|
|
201
|
|
|
@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(file_okay=False))
@click.option(
    '--suffix',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes are kept in comments instead.
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit, threshold)

    documents = annif.corpus.DocumentDirectory(
        directory, require_subjects=False)
    for docfilename, dummy_subjectfn in documents:
        # Pass the suffix through a function so that it is inserted
        # literally; as a plain replacement string, backslashes in a
        # user-supplied suffix would be interpreted as regex escapes.
        subjectfilename = re.sub(
            r'\.txt$', lambda _match: suffix, docfilename)

        # Decide whether to skip before reading the document at all.
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue

        with open(docfilename, encoding='utf-8') as docfile:
            text = docfile.read()

        # Compute the suggestions before opening the result file: opening
        # with 'w' truncates it, so a failure in suggest() would otherwise
        # leave an empty result file that later runs refuse to overwrite.
        results = project.suggest(text, backend_params)

        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hit_filter(results):
                line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)
                click.echo(line, file=subjfile)
|
240
|
|
|
|
|
241
|
|
|
|
|
242
|
|
|
@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
@common_options
def run_eval(project_id, paths, limit, threshold, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    project = get_project(project_id)
    overrides = parse_backend_params(backend_param)

    keep_best = SuggestionFilter(limit=limit, threshold=threshold)
    batch = annif.eval.EvaluationBatch(project.subjects)

    corpus = open_documents(paths)
    for document in corpus.documents:
        # Gold-standard subjects of the document, as (URIs, labels)
        gold = annif.corpus.SubjectSet((document.uris, document.labels))
        suggestions = project.suggest(document.text, overrides)
        batch.evaluate(keep_best(suggestions), gold)

    # One "metric:<TAB>score" line per metric reported by the batch
    row = "{0:<20}\t{1}"
    for metric, score in batch.results().items():
        click.echo(row.format(metric + ":", score))
|
274
|
|
|
|
|
275
|
|
|
|
|
276
|
|
|
@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
@common_options
def run_optimize(project_id, paths, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)

    filter_batches = generate_filter_batches(project.subjects)

    # Suggest once per document, then feed the same hits through every
    # limit/threshold filter into that filter's own evaluation batch.
    ndocs = 0
    docs = open_documents(paths)
    for ndocs, doc in enumerate(docs.documents, start=1):
        hits = project.suggest(doc.text, backend_params)
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Keep the batches in a queue that is consumed along the way, so that
    # GC has a chance to reclaim the memory of batches already reported.
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        params, (dummy_filter, batch) = filter_batches.popleft()
        results = batch.results(metrics='simple')
        for metric, score in results.items():
            # ">=" means that on a tie the later combination wins
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        limit, threshold = params
        click.echo(
            template.format(
                limit,
                threshold,
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    reported_metrics = (
        'Precision (doc avg)',
        'Recall (doc avg)',
        'F1 score (doc avg)',
        'NDCG@5',
        'NDCG@10',
    )
    for metric in reported_metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))
|
343
|
|
|
|
|
344
|
|
|
|
|
345
|
|
|
# Invoke the Click command group when this module is run as a script.
if __name__ == '__main__':
    cli()
|
347
|
|
|
|