annif.cli.run_clear_project() - Code Metrics - Inspection of "Merge pull request #306 from NatLibFi/issue251-CLI..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 93695b...d2dff1 )

by Osma

created 2019-08-23 08:21 UTC

annif.cli.run_clear_project() A

↳ Parent: annif.cli

Complexity

Conditions

Size

Total Lines	9
Code Lines	6

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	6
dl	0
loc	9
rs	10
c	0
b	0
f	0
cc	1
nop	1

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import os.path
import re
import sys
import click
import click_log
from flask import current_app
from flask.cli import FlaskGroup, ScriptInfo
import annif
import annif.corpus
import annif.eval
import annif.project
from annif.project import Access
from annif.suggestion import SuggestionFilter

# Reuse the logger object exposed by the annif package for all CLI output.
logger = annif.logger
# Hand the logger to click_log for its basic configuration.
click_log.basic_config(logger)

# Root command group; the Flask application is created via annif.create_app.
cli = FlaskGroup(create_app=annif.create_app)


def get_project(project_id):
    """
    Look up a project by its ID (hidden projects included). If the lookup
    raises ValueError, print an error message to stderr and exit with
    status 1."""
    try:
        project = annif.project.get_project(
            project_id, min_access=Access.hidden)
    except ValueError:
        message = "No projects found with id \'{0}\'.".format(project_id)
        click.echo(message, err=True)
        sys.exit(1)
    return project


def open_documents(paths):
    """Helper function to open a document corpus from a list of pathnames,
    each of which is either a TSV file or a directory of TXT files. The
    corpus will be returned as an instance of DocumentCorpus.

    A single path is opened directly; several paths are wrapped in a
    CombinedCorpus. Raises click.UsageError when no paths are given."""

    def open_doc_path(path):
        """open a single path and return it as a DocumentCorpus"""
        if os.path.isdir(path):
            return annif.corpus.DocumentDirectory(path, require_subjects=True)
        return annif.corpus.DocumentFile(path)

    # The commands declare `paths` with nargs=-1, so Click happily passes an
    # empty tuple; report that as a usage error instead of crashing with an
    # IndexError on paths[0].
    if not paths:
        raise click.UsageError(
            "At least one document path (TSV file or directory) is required.")

    if len(paths) > 1:
        corpora = [open_doc_path(path) for path in paths]
        return annif.corpus.CombinedCorpus(corpora)
    return open_doc_path(paths[0])


def parse_backend_params(backend_param):
    """Turn the values of the --backend-param option, each of the form
    "backend.key=value", into a nested mapping {backend: {key: value}}.

    Only the first '.' and the first '=' act as separators. A value that
    lacks either separator raises ValueError."""
    parsed = collections.defaultdict(dict)
    for spec in backend_param:
        backend_id, assignment = spec.split('.', 1)
        name, value = assignment.split('=', 1)
        parsed[backend_id].update({name: value})
    return parsed


def generate_filter_batches(subjects):
    """Build an ordered mapping from (limit, threshold) to a pair of
    (SuggestionFilter, EvaluationBatch), for every limit in 1..15 combined
    with every threshold i * 0.05 for i in 0..19."""
    thresholds = [step * 0.05 for step in range(20)]
    batches = collections.OrderedDict()
    for limit in range(1, 16):
        for threshold in thresholds:
            key = (limit, threshold)
            batches[key] = (SuggestionFilter(limit, threshold),
                            annif.eval.EvaluationBatch(subjects))
    return batches


def set_project_config_file_path(ctx, param, value):
    """Click option callback: when a value is given, store it as the
    PROJECTS_FILE setting of the current application."""
    app = ctx.ensure_object(ScriptInfo).load_app()
    # The app is loaded and its context entered even when no value is given.
    with app.app_context():
        if not value:
            return
        current_app.config['PROJECTS_FILE'] = value


def common_options(f):
    """Decorator that attaches the options shared by all CLI commands:
    -p/--projects and the click_log verbosity option."""
    projects_option = click.option(
        '-p', '--projects', help='Set path to projects.cfg',
        callback=set_project_config_file_path, expose_value=False,
        is_eager=True)
    verbosity_option = click_log.simple_verbosity_option(logger)
    # The projects option is applied first, then the verbosity option.
    return verbosity_option(projects_option(f))


@cli.command('list-projects')
@common_options
def run_list_projects():
    """
    List available projects.
    """
    # Fixed-width columns: project ID, project name, language.
    row = "{0: <25}{1: <45}{2: <8}".format
    heading = row("Project ID", "Project Name", "Language")
    click.echo(heading)
    click.echo(len(heading) * "-")

    projects = annif.project.get_projects(min_access=Access.private)
    for project in projects.values():
        click.echo(row(project.project_id, project.name, project.language))


@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """
    project = get_project(project_id)

    def show(label, value):
        # One "label value" line, with the label padded to 20 columns.
        click.echo("{0:<20}{1}".format(label, value))

    show('Project ID:', project.project_id)
    show('Project Name:', project.name)
    show('Language:', project.language)
    show('Access:', project.access.name)


@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # get_project() exits the process itself if the ID is unknown.
    get_project(project_id).remove_model_data()


@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    project = get_project(project_id)
    skos_reader = annif.corpus.SubjectFileSKOS
    if not skos_reader.is_rdf_file(subjectfile):
        # not recognised as RDF, so treat it as a TSV file
        vocabulary = annif.corpus.SubjectFileTSV(subjectfile)
    else:
        # SKOS/RDF file supported by rdflib
        vocabulary = skos_reader(subjectfile, project.language)
    project.vocab.load_vocabulary(vocabulary)


@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@common_options
def run_train(project_id, paths):
    """
    Train a project on a collection of documents.
    """
    # Resolve the project first so an unknown ID is reported before any
    # corpus is opened.
    project = get_project(project_id)
    project.train(open_documents(paths))


@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@common_options
def run_learn(project_id, paths):
    """
    Further train an existing project on a collection of documents.
    """
    # Resolve the project first so an unknown ID is reported before any
    # corpus is opened.
    project = get_project(project_id)
    project.learn(open_documents(paths))


@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    document = sys.stdin.read()
    overrides = parse_backend_params(backend_param)
    keep_best = SuggestionFilter(limit, threshold)
    suggestions = project.suggest(document, overrides)
    # One tab-separated line per suggestion that passes the filter.
    for suggestion in keep_best(suggestions):
        click.echo("<{}>\t{}\t{}".format(
            suggestion.uri, suggestion.label, suggestion.score))


@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(file_okay=False))
@click.option(
    '--suffix',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        # Use a callable replacement so the user-supplied suffix is inserted
        # literally; a plain string would be processed as a regex
        # replacement template (backslash escapes, group references).
        subjectfilename = re.sub(
            r'\.txt$', lambda _match: suffix, docfilename)

        # Decide whether to skip before reading the document at all.
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue

        with open(docfilename, encoding='utf-8') as docfile:
            text = docfile.read()

        # Compute the suggestions before opening (and truncating) the result
        # file, so a failure in suggest() does not leave an empty result
        # file behind that would block later runs without --force.
        hits = hit_filter(project.suggest(text, backend_params))

        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hits:
                line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)
                click.echo(line, file=subjfile)


@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
@common_options
def run_eval(project_id, paths, limit, threshold, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    project = get_project(project_id)
    overrides = parse_backend_params(backend_param)

    keep_best = SuggestionFilter(limit=limit, threshold=threshold)
    batch = annif.eval.EvaluationBatch(project.subjects)

    corpus = open_documents(paths)
    for document in corpus.documents:
        suggestions = keep_best(project.suggest(document.text, overrides))
        # The document's own subjects serve as the gold standard.
        gold = annif.corpus.SubjectSet((document.uris, document.labels))
        batch.evaluate(suggestions, gold)

    # One "metric:<TAB>score" line per metric reported by the batch.
    for metric, score in batch.results().items():
        click.echo("{0:<20}\t{1}".format(metric + ":", score))


@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.option('--backend-param', '-b', multiple=True,
              help='Backend parameters to override')
@common_options
def run_optimize(project_id, paths, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    overrides = parse_backend_params(backend_param)

    batches_by_params = generate_filter_batches(project.subjects)

    # Run suggest once per document and feed the unfiltered suggestions to
    # every limit/threshold combination.
    ndocs = 0
    corpus = open_documents(paths)
    for document in corpus.documents:
        suggestions = project.suggest(document.text, overrides)
        gold = annif.corpus.SubjectSet((document.uris, document.labels))
        for keep_best, batch in batches_by_params.values():
            batch.evaluate(keep_best(suggestions), gold)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    # Highest score seen per metric and the (limit, threshold) producing it;
    # on ties the later combination wins.
    best_scores = collections.defaultdict(float)
    best_params = {}

    row_template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a list that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    pending = list(batches_by_params.items())
    while pending:
        params, filter_and_batch = pending.pop(0)
        limit, threshold = params
        results = filter_and_batch[1].results(metrics='simple')
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            row_template.format(
                limit,
                threshold,
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    summary_template = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    reported_metrics = ('Precision (doc avg)',
                        'Recall (doc avg)',
                        'F1 score (doc avg)',
                        'NDCG@5',
                        'NDCG@10')
    for metric in reported_metrics:
        click.echo(
            summary_template.format(
                metric,
                best_scores[metric],
                *best_params[metric]))
    click.echo("Documents evaluated:\t{}".format(ndocs))


# Invoke the command group when this module is executed as a script.
if __name__ == '__main__':
    cli()


1			"""Definitions for command-line (Click) commands for invoking Annif
2			operations and printing the results to console."""
3
4
5			import collections
6			import os.path
7			import re
8			import sys
9			import click
10			import click_log
11			from flask import current_app
12			from flask.cli import FlaskGroup, ScriptInfo
13			import annif
14			import annif.corpus
15			import annif.eval
16			import annif.project
17			from annif.project import Access
18			from annif.suggestion import SuggestionFilter
19
20			logger = annif.logger
21			click_log.basic_config(logger)
22
23			cli = FlaskGroup(create_app=annif.create_app)
24
25
26			def get_project(project_id):
27			"""
28			Helper function to get a project by ID and bail out if it doesn't exist"""
29			try:
30			return annif.project.get_project(project_id, min_access=Access.hidden)
31			except ValueError:
32			click.echo(
33			"No projects found with id \'{0}\'.".format(project_id),
34			err=True)
35			sys.exit(1)
36
37
38			def open_documents(paths):
39			"""Helper function to open a document corpus from a list of pathnames,
40			each of which is either a TSV file or a directory of TXT files. The
41			corpus will be returned as an instance of DocumentCorpus."""
42
43			def open_doc_path(path):
44			"""open a single path and return it as a DocumentCorpus"""
45			if os.path.isdir(path):
46			return annif.corpus.DocumentDirectory(path, require_subjects=True)
47			return annif.corpus.DocumentFile(path)
48
49			if len(paths) > 1:
50			corpora = [open_doc_path(path) for path in paths]
51			docs = annif.corpus.CombinedCorpus(corpora)
52			else:
53			docs = open_doc_path(paths[0])
54			return docs
55
56
57			def parse_backend_params(backend_param):
58			"""Parse a list of backend parameters given with the --backend-param
59			option into a nested dict structure"""
60			backend_params = collections.defaultdict(dict)
61			for beparam in backend_param:
62			backend, param = beparam.split('.', 1)
63			key, val = param.split('=', 1)
64			backend_params[backend][key] = val
65			return backend_params
66
67
68			def generate_filter_batches(subjects):
69			filter_batches = collections.OrderedDict()
70			for limit in range(1, 16):
71			for threshold in [i * 0.05 for i in range(20)]:
72			hit_filter = SuggestionFilter(limit, threshold)
73			batch = annif.eval.EvaluationBatch(subjects)
74			filter_batches[(limit, threshold)] = (hit_filter, batch)
75			return filter_batches
76
77
78			def set_project_config_file_path(ctx, param, value):
79			"""Override the default path or the path given in env by CLI option"""
80			with ctx.ensure_object(ScriptInfo).load_app().app_context():
81			if value:
82			current_app.config['PROJECTS_FILE'] = value
83
84
85			def common_options(f):
86			"""Decorator to add common options for all CLI commands"""
87			f = click.option(
88			'-p', '--projects', help='Set path to projects.cfg',
89			callback=set_project_config_file_path, expose_value=False,
90			is_eager=True)(f)
91			f = click_log.simple_verbosity_option(logger)(f)
92			return f
93
94
95			@cli.command('list-projects')
96			@common_options
97			def run_list_projects():
98			"""
99			List available projects.
100			"""
101
102			template = "{0: <25}{1: <45}{2: <8}"
103			header = template.format("Project ID", "Project Name", "Language")
104			click.echo(header)
105			click.echo("-" * len(header))
106			for proj in annif.project.get_projects(min_access=Access.private).values():
107			click.echo(template.format(proj.project_id, proj.name, proj.language))
108
109
110			@cli.command('show-project')
111			@click.argument('project_id')
112			@common_options
113			def run_show_project(project_id):
114			"""
115			Show information about a project.
116			"""
117
118			proj = get_project(project_id)
119			template = "{0:<20}{1}"
120			click.echo(template.format('Project ID:', proj.project_id))
121			click.echo(template.format('Project Name:', proj.name))
122			click.echo(template.format('Language:', proj.language))
123			click.echo(template.format('Access:', proj.access.name))
124
125
126			@cli.command('clear')
127			@click.argument('project_id')
128			@common_options
129			def run_clear_project(project_id):
130			"""
131			Initialize the project to its original, untrained state.
132			"""
133			proj = get_project(project_id)
134			proj.remove_model_data()
135
136
137			@cli.command('loadvoc')
138			@click.argument('project_id')
139			@click.argument('subjectfile', type=click.Path(dir_okay=False))
140			@common_options
141			def run_loadvoc(project_id, subjectfile):
142			"""
143			Load a vocabulary for a project.
144			"""
145			proj = get_project(project_id)
146			if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
147			# SKOS/RDF file supported by rdflib
148			subjects = annif.corpus.SubjectFileSKOS(subjectfile, proj.language)
149			else:
150			# probably a TSV file
151			subjects = annif.corpus.SubjectFileTSV(subjectfile)
152			proj.vocab.load_vocabulary(subjects)
153
154
155			@cli.command('train')
156			@click.argument('project_id')
157			@click.argument('paths', type=click.Path(), nargs=-1)
158			@common_options
159			def run_train(project_id, paths):
160			"""
161			Train a project on a collection of documents.
162			"""
163			proj = get_project(project_id)
164			documents = open_documents(paths)
165			proj.train(documents)
166
167
168			@cli.command('learn')
169			@click.argument('project_id')
170			@click.argument('paths', type=click.Path(), nargs=-1)
171			@common_options
172			def run_learn(project_id, paths):
173			"""
174			Further train an existing project on a collection of documents.
175			"""
176			proj = get_project(project_id)
177			documents = open_documents(paths)
178			proj.learn(documents)
179
180
181			@cli.command('suggest')
182			@click.argument('project_id')
183			@click.option('--limit', default=10, help='Maximum number of subjects')
184			@click.option('--threshold', default=0.0, help='Minimum score threshold')
185			@click.option('--backend-param', '-b', multiple=True,
186			help='Backend parameters to override')
187			@common_options
188			def run_suggest(project_id, limit, threshold, backend_param):
189			"""
190			Suggest subjects for a single document from standard input.
191			"""
192			project = get_project(project_id)
193			text = sys.stdin.read()
194			backend_params = parse_backend_params(backend_param)
195			hit_filter = SuggestionFilter(limit, threshold)
196			hits = hit_filter(project.suggest(text, backend_params))
197			for hit in hits:
198			click.echo("<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score))
199
200
201			@cli.command('index')
202			@click.argument('project_id')
203			@click.argument('directory', type=click.Path(file_okay=False))
204			@click.option(
205			'--suffix',
206			default='.annif',
207			help='File name suffix for result files')
208			@click.option('--force/--no-force', default=False,
209			help='Force overwriting of existing result files')
210			@click.option('--limit', default=10, help='Maximum number of subjects')
211			@click.option('--threshold', default=0.0, help='Minimum score threshold')
212			@click.option('--backend-param', '-b', multiple=True,
213			help='Backend parameters to override')
214			@common_options
215			def run_index(project_id, directory, suffix, force,
216			limit, threshold, backend_param):
217			"""
218			Index a directory with documents, suggesting subjects for each document.
219			Write the results in TSV files with the given suffix.
220			"""
221			project = get_project(project_id)
222			backend_params = parse_backend_params(backend_param)
223			hit_filter = SuggestionFilter(limit, threshold)
224
225			for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
226			directory, require_subjects=False):
227			with open(docfilename, encoding='utf-8') as docfile:
228			text = docfile.read()
229			subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
230			if os.path.exists(subjectfilename) and not force:
231			click.echo(
232			"Not overwriting {} (use --force to override)".format(
233			subjectfilename))
234			continue
235			with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
236			results = project.suggest(text, backend_params)
237			for hit in hit_filter(results):
238			line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)
239			click.echo(line, file=subjfile)
240
241
242			@cli.command('eval')
243			@click.argument('project_id')
244			@click.argument('paths', type=click.Path(), nargs=-1)
245			@click.option('--limit', default=10, help='Maximum number of subjects')
246			@click.option('--threshold', default=0.0, help='Minimum score threshold')
247			@click.option('--backend-param', '-b', multiple=True,
248			help='Backend parameters to override')
249			@common_options
250			def run_eval(project_id, paths, limit, threshold, backend_param):
251			"""
252			Analyze documents and evaluate the result.
253
254			Compare the results of automated indexing against a gold standard. The
255			path may be either a TSV file with short documents or a directory with
256			documents in separate files.
257			"""
258			project = get_project(project_id)
259			backend_params = parse_backend_params(backend_param)
260
261			hit_filter = SuggestionFilter(limit=limit, threshold=threshold)
262			eval_batch = annif.eval.EvaluationBatch(project.subjects)
263
264			docs = open_documents(paths)
265			for doc in docs.documents:
266			results = project.suggest(doc.text, backend_params)
267			hits = hit_filter(results)
268			eval_batch.evaluate(hits,
269			annif.corpus.SubjectSet((doc.uris, doc.labels)))
270
271			template = "{0:<20}\t{1}"
272			for metric, score in eval_batch.results().items():
273			click.echo(template.format(metric + ":", score))
274
275
276			@cli.command('optimize')
277			@click.argument('project_id')
278			@click.argument('paths', type=click.Path(), nargs=-1)
279			@click.option('--backend-param', '-b', multiple=True,
280			help='Backend parameters to override')
281			@common_options
282			def run_optimize(project_id, paths, backend_param):
283			"""
284			Analyze documents, testing multiple limits and thresholds.
285
286			Evaluate the analysis results for a directory with documents against a
287			gold standard given in subject files. Test different limit/threshold
288			values and report the precision, recall and F-measure of each combination
289			of settings.
290			"""
291			project = get_project(project_id)
292			backend_params = parse_backend_params(backend_param)
293
294			filter_batches = generate_filter_batches(project.subjects)
295
296			ndocs = 0
297			docs = open_documents(paths)
298			for doc in docs.documents:
299			hits = project.suggest(doc.text, backend_params)
300			gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
301			for hit_filter, batch in filter_batches.values():
302			batch.evaluate(hit_filter(hits), gold_subjects)
303			ndocs += 1
304
305			click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))
306
307			best_scores = collections.defaultdict(float)
308			best_params = {}
309
310			template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
311			# Store the batches in a list that gets consumed along the way
312			# This way GC will have a chance to reclaim the memory
313			filter_batches = list(filter_batches.items())
314			while filter_batches:
315			params, filter_batch = filter_batches.pop(0)
316			results = filter_batch[1].results(metrics='simple')
317			for metric, score in results.items():
318			if score >= best_scores[metric]:
319			best_scores[metric] = score
320			best_params[metric] = params
321			click.echo(
322			template.format(
323			params[0],
324			params[1],
325			results['Precision (doc avg)'],
326			results['Recall (doc avg)'],
327			results['F1 score (doc avg)']))
328
329			click.echo()
330			template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
331			for metric in ('Precision (doc avg)',
332			'Recall (doc avg)',
333			'F1 score (doc avg)',
334			'NDCG@5',
335			'NDCG@10'):
336			click.echo(
337			template2.format(
338			metric,
339			best_scores[metric],
340			best_params[metric][0],
341			best_params[metric][1]))
342			click.echo("Documents evaluated:\t{}".format(ndocs))
343
344
345			if __name__ == '__main__':
346			cli()
347

NatLibFi / Annif

Push — master ( 93695b...d2dff1 )

annif.cli.run_clear_project() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like