Passed
Push — master ( c8c370...dee89b )
by Osma
03:14
created

annif.cli.run_analyzedir()   B

Complexity

Conditions 7

Size

Total Lines 39
Code Lines 34

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 34
dl 0
loc 39
rs 7.664
c 0
b 0
f 0
cc 7
nop 7
"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""

import collections
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.corpus
import annif.eval
import annif.project
from annif.project import Access
from annif.suggestion import SuggestionFilter

# Shared application logger; click_log routes it to the console with a
# CLI-selectable verbosity level.
logger = annif.logger
click_log.basic_config(logger)

# Click command group wired to the Flask application factory, so commands
# can use the application configuration.
cli = FlaskGroup(create_app=annif.create_app)
def get_project(project_id):
    """Look up a project by ID and return it, exiting the program with an
    error message if no such project exists."""
    try:
        project = annif.project.get_project(project_id,
                                            min_access=Access.hidden)
    except ValueError:
        # Unknown project ID: report on stderr and abort.
        click.echo("No projects found with id '{0}'.".format(project_id),
                   err=True)
        sys.exit(1)
    return project
def open_documents(paths):
    """Open a document corpus from a list of pathnames and return it as a
    DocumentCorpus instance. Each path is either a TSV file or a directory
    of TXT files; multiple paths are merged into a CombinedCorpus."""

    def _open_single(path):
        # A directory is a collection of TXT documents with subject files;
        # anything else is treated as a TSV document file.
        if os.path.isdir(path):
            return annif.corpus.DocumentDirectory(path, require_subjects=True)
        return annif.corpus.DocumentFile(path)

    if len(paths) > 1:
        return annif.corpus.CombinedCorpus(
            [_open_single(path) for path in paths])
    return _open_single(paths[0])
def parse_backend_params(backend_param):
    """Parse a list of "backend.key=value" strings given with the
    --backend-param option into a nested dict keyed first by backend
    name and then by parameter name."""
    parsed = collections.defaultdict(dict)
    for item in backend_param:
        # Only the first '.' and '=' are separators, so values may
        # themselves contain those characters.
        backend, rest = item.split('.', 1)
        key, value = rest.split('=', 1)
        parsed[backend][key] = value
    return parsed
def generate_filter_batches(subjects):
    """Build an ordered mapping from (limit, threshold) parameter
    combinations to (SuggestionFilter, EvaluationBatch) pairs, covering
    limits 1-15 and thresholds 0.00-0.95 in steps of 0.05."""
    batches = collections.OrderedDict()
    thresholds = [step * 0.05 for step in range(20)]
    for limit in range(1, 16):
        for threshold in thresholds:
            batches[(limit, threshold)] = (
                SuggestionFilter(limit, threshold),
                annif.eval.EvaluationBatch(subjects))
    return batches
@cli.command('list-projects')
def run_list_projects():
    """
    List available projects.
    """

    # Fixed-width columns: ID, name, language.
    row_fmt = "{0: <25}{1: <45}{2: <8}"
    heading = row_fmt.format("Project ID", "Project Name", "Language")
    click.echo(heading)
    click.echo("-" * len(heading))
    projects = annif.project.get_projects(min_access=Access.private)
    for project in projects.values():
        click.echo(row_fmt.format(
            project.project_id, project.name, project.language))
@cli.command('show-project')
@click.argument('project_id')
def run_show_project(project_id):
    """
    Show information about a project.
    """

    project = get_project(project_id)
    row_fmt = "{0:<20}{1}"
    for label, value in (('Project ID:', project.project_id),
                         ('Project Name:', project.name),
                         ('Language:', project.language),
                         ('Access:', project.access.name)):
        click.echo(row_fmt.format(label, value))
@cli.command('loadvoc')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(dir_okay=False))
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    project = get_project(project_id)
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # A SKOS/RDF vocabulary readable by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile, project.language)
    else:
        # Otherwise assume a TSV subject file
        subjects = annif.corpus.SubjectFileTSV(subjectfile)
    project.vocab.load_vocabulary(subjects)
@cli.command('train')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
def run_train(project_id, paths):
    """
    Train a project on a collection of documents.
    """
    # Resolve the project first so an unknown ID fails before any corpus I/O.
    project = get_project(project_id)
    corpus = open_documents(paths)
    project.train(corpus)
@cli.command('learn')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
def run_learn(project_id, paths):
    """
    Further train an existing project on a collection of documents.
    """
    # Resolve the project first so an unknown ID fails before any corpus I/O.
    project = get_project(project_id)
    corpus = open_documents(paths)
    project.learn(corpus)
@cli.command('suggest')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    document_text = sys.stdin.read()
    params = parse_backend_params(backend_param)
    # Filter the raw suggestions by limit/threshold before printing.
    suggestions = SuggestionFilter(limit, threshold)(
        project.suggest(document_text, params))
    for suggestion in suggestions:
        click.echo("<{}>\t{}\t{}".format(
            suggestion.uri, suggestion.label, suggestion.score))
@cli.command('index')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('directory', type=click.Path(file_okay=False))
@click.option(
    '--suffix',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        # Result file name: replace the .txt extension with the suffix.
        subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
        # Check for an existing result *before* reading the document, so
        # skipped documents are not needlessly read from disk.
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue
        with open(docfilename, encoding='utf-8') as docfile:
            text = docfile.read()
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            results = project.suggest(text, backend_params)
            for hit in hit_filter(results):
                line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)
                click.echo(line, file=subjfile)
@cli.command('eval')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
def run_eval(project_id, paths, limit, threshold, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    project = get_project(project_id)
    params = parse_backend_params(backend_param)

    hit_filter = SuggestionFilter(limit=limit, threshold=threshold)
    batch = annif.eval.EvaluationBatch(project.subjects)

    corpus = open_documents(paths)
    for doc in corpus.documents:
        # Suggest, filter, then evaluate against the gold-standard subjects.
        suggestions = hit_filter(project.suggest(doc.text, params))
        gold = annif.corpus.SubjectSet((doc.uris, doc.labels))
        batch.evaluate(suggestions, gold)

    row_fmt = "{0:<20}\t{1}"
    for metric, score in batch.results().items():
        click.echo(row_fmt.format(metric + ":", score))
@cli.command('optimize')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
def run_optimize(project_id, paths, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    params = parse_backend_params(backend_param)

    # One (filter, batch) pair per (limit, threshold) combination.
    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    corpus = open_documents(paths)
    for doc in corpus.documents:
        # Suggest once per document, then feed the same raw hits through
        # every filter/batch combination.
        hits = project.suggest(doc.text, params)
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    row_fmt = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Consume the batches from the front of a list so each one becomes
    # garbage-collectable as soon as its scores have been reported.
    remaining = list(filter_batches.items())
    while remaining:
        combination, (dummy_filter, batch) = remaining.pop(0)
        batch_results = batch.results(metrics='simple')
        for metric, score in batch_results.items():
            # ">=" so that on ties the later parameter combination wins.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = combination
        click.echo(row_fmt.format(
            combination[0],
            combination[1],
            batch_results['Precision (doc avg)'],
            batch_results['Recall (doc avg)'],
            batch_results['F1 score (doc avg)']))

    click.echo()
    best_fmt = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in ('Precision (doc avg)',
                   'Recall (doc avg)',
                   'F1 score (doc avg)',
                   'NDCG@5',
                   'NDCG@10'):
        click.echo(best_fmt.format(
            metric,
            best_scores[metric],
            best_params[metric][0],
            best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))
# Entry point when this module is executed directly as a script.
if __name__ == '__main__':
    cli()