Passed
Push — master ( c8c370...dee89b )
by Osma
03:14
created

annif.cli.run_analyzedir()   B

Complexity

Conditions 7

Size

Total Lines 39
Code Lines 34

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 34
dl 0
loc 39
rs 7.664
c 0
b 0
f 0
cc 7
nop 7
"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""

import collections
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.corpus
import annif.eval
import annif.project
from annif.project import Access
from annif.suggestion import SuggestionFilter

# Shared application logger; click_log routes it to the console with a
# CLI-selectable verbosity level.
logger = annif.logger
click_log.basic_config(logger)

# Click command group wired to the Flask application factory, so commands
# can use the application configuration.
cli = FlaskGroup(create_app=annif.create_app)
def get_project(project_id):
    """Look up a project by ID and return it, exiting the program with an
    error message if no such project exists."""
    try:
        project = annif.project.get_project(project_id,
                                            min_access=Access.hidden)
    except ValueError:
        # Unknown project ID: report on stderr and abort.
        click.echo("No projects found with id '{0}'.".format(project_id),
                   err=True)
        sys.exit(1)
    return project
def open_documents(paths):
    """Open a document corpus from a list of pathnames and return it as a
    DocumentCorpus instance. Each path is either a TSV file or a directory
    of TXT files; multiple paths are merged into a CombinedCorpus."""

    def _open_single(path):
        # A directory is a collection of TXT documents with subject files;
        # anything else is treated as a TSV document file.
        if os.path.isdir(path):
            return annif.corpus.DocumentDirectory(path, require_subjects=True)
        return annif.corpus.DocumentFile(path)

    if len(paths) > 1:
        return annif.corpus.CombinedCorpus(
            [_open_single(path) for path in paths])
    return _open_single(paths[0])
def parse_backend_params(backend_param):
    """Parse a list of "backend.key=value" strings given with the
    --backend-param option into a nested dict keyed first by backend
    name and then by parameter name."""
    parsed = collections.defaultdict(dict)
    for item in backend_param:
        # Only the first '.' and '=' are separators, so values may
        # themselves contain those characters.
        backend, rest = item.split('.', 1)
        key, value = rest.split('=', 1)
        parsed[backend][key] = value
    return parsed
def generate_filter_batches(subjects):
    """Build an ordered mapping from (limit, threshold) parameter
    combinations to (SuggestionFilter, EvaluationBatch) pairs, covering
    limits 1-15 and thresholds 0.00-0.95 in steps of 0.05."""
    batches = collections.OrderedDict()
    thresholds = [step * 0.05 for step in range(20)]
    for limit in range(1, 16):
        for threshold in thresholds:
            batches[(limit, threshold)] = (
                SuggestionFilter(limit, threshold),
                annif.eval.EvaluationBatch(subjects))
    return batches
@cli.command('list-projects')
def run_list_projects():
    """
    List available projects.
    """

    # Fixed-width columns: ID, name, language.
    row_fmt = "{0: <25}{1: <45}{2: <8}"
    heading = row_fmt.format("Project ID", "Project Name", "Language")
    click.echo(heading)
    click.echo("-" * len(heading))
    projects = annif.project.get_projects(min_access=Access.private)
    for project in projects.values():
        click.echo(row_fmt.format(
            project.project_id, project.name, project.language))
@cli.command('show-project')
@click.argument('project_id')
def run_show_project(project_id):
    """
    Show information about a project.
    """

    project = get_project(project_id)
    row_fmt = "{0:<20}{1}"
    for label, value in (('Project ID:', project.project_id),
                         ('Project Name:', project.name),
                         ('Language:', project.language),
                         ('Access:', project.access.name)):
        click.echo(row_fmt.format(label, value))
@cli.command('loadvoc')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(dir_okay=False))
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    project = get_project(project_id)
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # A SKOS/RDF vocabulary readable by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile, project.language)
    else:
        # Otherwise assume a TSV subject file
        subjects = annif.corpus.SubjectFileTSV(subjectfile)
    project.vocab.load_vocabulary(subjects)
@cli.command('train')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
def run_train(project_id, paths):
    """
    Train a project on a collection of documents.
    """
    # Resolve the project first so an unknown ID fails before any corpus I/O.
    project = get_project(project_id)
    corpus = open_documents(paths)
    project.train(corpus)
@cli.command('learn')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
def run_learn(project_id, paths):
    """
    Further train an existing project on a collection of documents.
    """
    # Resolve the project first so an unknown ID fails before any corpus I/O.
    project = get_project(project_id)
    corpus = open_documents(paths)
    project.learn(corpus)
@cli.command('suggest')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    document_text = sys.stdin.read()
    params = parse_backend_params(backend_param)
    # Filter the raw suggestions by limit/threshold before printing.
    suggestions = SuggestionFilter(limit, threshold)(
        project.suggest(document_text, params))
    for suggestion in suggestions:
        click.echo("<{}>\t{}\t{}".format(
            suggestion.uri, suggestion.label, suggestion.score))
@cli.command('index')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('directory', type=click.Path(file_okay=False))
@click.option(
    '--suffix',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        # Result file name: replace the .txt extension with the suffix.
        subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
        # Check for an existing result *before* reading the document, so
        # skipped documents are not needlessly read from disk.
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue
        with open(docfilename, encoding='utf-8') as docfile:
            text = docfile.read()
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            results = project.suggest(text, backend_params)
            for hit in hit_filter(results):
                line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)
                click.echo(line, file=subjfile)
@cli.command('eval')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
def run_eval(project_id, paths, limit, threshold, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    project = get_project(project_id)
    params = parse_backend_params(backend_param)

    hit_filter = SuggestionFilter(limit=limit, threshold=threshold)
    batch = annif.eval.EvaluationBatch(project.subjects)

    corpus = open_documents(paths)
    for doc in corpus.documents:
        # Suggest, filter, then evaluate against the gold-standard subjects.
        suggestions = hit_filter(project.suggest(doc.text, params))
        gold = annif.corpus.SubjectSet((doc.uris, doc.labels))
        batch.evaluate(suggestions, gold)

    row_fmt = "{0:<20}\t{1}"
    for metric, score in batch.results().items():
        click.echo(row_fmt.format(metric + ":", score))
@cli.command('optimize')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
def run_optimize(project_id, paths, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    params = parse_backend_params(backend_param)

    # One (filter, batch) pair per (limit, threshold) combination.
    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    corpus = open_documents(paths)
    for doc in corpus.documents:
        # Suggest once per document, then feed the same raw hits through
        # every filter/batch combination.
        hits = project.suggest(doc.text, params)
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    row_fmt = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Consume the batches from the front of a list so each one becomes
    # garbage-collectable as soon as its scores have been reported.
    remaining = list(filter_batches.items())
    while remaining:
        combination, (dummy_filter, batch) = remaining.pop(0)
        batch_results = batch.results(metrics='simple')
        for metric, score in batch_results.items():
            # ">=" so that on ties the later parameter combination wins.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = combination
        click.echo(row_fmt.format(
            combination[0],
            combination[1],
            batch_results['Precision (doc avg)'],
            batch_results['Recall (doc avg)'],
            batch_results['F1 score (doc avg)']))

    click.echo()
    best_fmt = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in ('Precision (doc avg)',
                   'Recall (doc avg)',
                   'F1 score (doc avg)',
                   'NDCG@5',
                   'NDCG@10'):
        click.echo(best_fmt.format(
            metric,
            best_scores[metric],
            best_params[metric][0],
            best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))
# Entry point when this module is executed directly as a script.
if __name__ == '__main__':
    cli()