annif.cli.run_learn() - Code Metrics - Inspection of "Implement hyperparameter optimization of ensemble..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#414)

by Osma

created 2020-07-21 12:25 UTC

annif.cli.run_learn() A

↳ Parent: annif.cli

Complexity

Conditions

Size

Total Lines	13
Code Lines	10

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	10
dl	0
loc	13
rs	9.9
c	0
b	0
f	0
cc	1
nop	3

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import os.path
import re
import sys
import click
import click_log
from flask import current_app
from flask.cli import FlaskGroup, ScriptInfo
import annif
import annif.corpus
import annif.eval
import annif.parallel
import annif.project
import annif.registry
from annif.project import Access
from annif.suggestion import SuggestionFilter
from annif.exception import ConfigurationException, NotSupportedException

# Reuse the logger object exposed by the annif package for all CLI output
# and hand it to click_log for configuration.
logger = annif.logger
click_log.basic_config(logger)

# Root command group; every command below is registered on it with
# @cli.command. annif.create_app is passed as the application factory.
cli = FlaskGroup(create_app=annif.create_app)


def get_project(project_id):
    """Look up a project by its ID, terminating the process if it is missing.

    The lookup goes through ``annif.registry.get_project`` with
    ``min_access=Access.hidden``. When that call raises ``ValueError`` an
    error message is written to stderr and the process exits with status 1.
    """
    try:
        project = annif.registry.get_project(
            project_id, min_access=Access.hidden)
    except ValueError:
        message = "No projects found with id '{0}'.".format(project_id)
        click.echo(message, err=True)
        sys.exit(1)
    return project


def open_documents(paths):
    """Open a document corpus from a sequence of pathnames.

    Every path is opened as a ``DocumentDirectory`` (subjects required) when
    it is a directory and as a ``DocumentFile`` otherwise. A single path
    yields that corpus directly, several paths are wrapped in a
    ``CombinedCorpus``, and no paths at all logs a warning and opens
    ``os.devnull`` so that callers still receive a corpus object.
    """

    def _open_single(path):
        """Open one path as a corpus, choosing the class by path type."""
        if not os.path.isdir(path):
            return annif.corpus.DocumentFile(path)
        return annif.corpus.DocumentDirectory(path, require_subjects=True)

    count = len(paths)
    if count > 1:
        return annif.corpus.CombinedCorpus(
            [_open_single(path) for path in paths])
    if count == 1:
        return _open_single(paths[0])

    # Nothing was given: fall back to the null device
    logger.warning('Reading empty file')
    return _open_single(os.path.devnull)


def parse_backend_params(backend_param, project):
    """Turn ``--backend-param`` values into a nested dict.

    Each entry is expected to look like ``<backend>.<parameter>=<value>``.
    The result maps backend -> {parameter: value}; values stay strings.
    Every entry is checked with ``validate_backend_params`` before it is
    stored. An entry lacking the ``.`` or ``=`` separator fails with
    ``ValueError`` from the tuple unpacking.
    """
    overrides = collections.defaultdict(dict)
    for spec in backend_param:
        # Split only on the first separator so values may contain '.'/'='
        backend_id, assignment = spec.split('.', 1)
        name, value = assignment.split('=', 1)
        validate_backend_params(backend_id, spec, project)
        overrides[backend_id][name] = value
    return overrides


def validate_backend_params(backend, beparam, project):
    """Check that one ``--backend-param`` override is acceptable.

    Args:
        backend: backend identifier taken from the override.
        beparam: the complete override string as given on the command line,
            in the form ``<backend>.<parameter>=<value>``.
        project: project object; ``project.config['backend']`` is compared
            against ``backend``.

    Raises:
        NotSupportedException: the override targets the ``algorithm``
            parameter.
        ConfigurationException: ``backend`` differs from the backend
            configured for the project.
    """
    # Inspect the parameter *name* only. A substring test on the whole
    # option would also reject overrides whose value (or whose longer
    # parameter name) merely happens to contain the word "algorithm".
    param = beparam.split('.', 1)[-1]
    key = param.split('=', 1)[0].strip()
    if key == 'algorithm':
        raise NotSupportedException('Algorithm overriding not supported.')
    if backend != project.config['backend']:
        raise ConfigurationException(
            'The backend {} in CLI option "-b {}" not matching the project'
            ' backend {}.'
            .format(backend, beparam, project.config['backend']))


def generate_filter_batches(subjects):
    """Build one (filter, evaluation batch) pair per limit/threshold setting.

    Limits run from 1 to 15 and thresholds are ``i * 0.05`` for ``i`` in
    0..19. The returned ``OrderedDict`` is keyed by ``(limit, threshold)``
    in that iteration order, and each value is a tuple of a
    ``SuggestionFilter`` and a fresh ``EvaluationBatch``.
    """
    thresholds = [step * 0.05 for step in range(20)]
    batches = collections.OrderedDict()
    for limit in range(1, 16):
        for threshold in thresholds:
            suggestion_filter = SuggestionFilter(subjects, limit, threshold)
            evaluation = annif.eval.EvaluationBatch(subjects)
            batches[(limit, threshold)] = (suggestion_filter, evaluation)
    return batches


def set_project_config_file_path(ctx, param, value):
    """Click callback storing the ``--projects`` path in the app config.

    The Flask app is loaded through the context's ``ScriptInfo`` object and
    its application context is entered regardless of ``value``; the
    ``PROJECTS_FILE`` setting is only overwritten when ``value`` is truthy.
    ``param`` is part of the callback signature and is not used.
    """
    script_info = ctx.ensure_object(ScriptInfo)
    app = script_info.load_app()
    with app.app_context():
        if not value:
            return
        current_app.config['PROJECTS_FILE'] = value


def common_options(f):
    """Decorator attaching the options shared by all CLI commands.

    Adds the eager ``-p/--projects`` option, whose value is consumed by
    ``set_project_config_file_path`` and not passed to the command, and
    then the click_log verbosity option bound to the module logger.
    """
    projects_option = click.option(
        '-p', '--projects', help='Set path to projects.cfg',
        type=click.Path(dir_okay=False, exists=True),
        callback=set_project_config_file_path, expose_value=False,
        is_eager=True)
    with_projects = projects_option(f)
    verbosity_option = click_log.simple_verbosity_option(logger)
    return verbosity_option(with_projects)


def backend_param_option(f):
    """Decorator adding the repeatable ``--backend-param``/``-b`` option.

    The collected values reach the command as ``backend_param``.
    """
    help_text = ('Override backend parameter of the config file. '
                 'Syntax: "-b <backend>.<parameter>=<value>".')
    option = click.option(
        '--backend-param', '-b', multiple=True, help=help_text)
    return option(f)


@cli.command('list-projects')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_projects():
    """
    List available projects.
    """

    # Fixed-width, left-aligned columns shared by the header and the rows
    row_format = "{0: <25}{1: <45}{2: <10}{3: <7}"
    heading = row_format.format(
        "Project ID", "Project Name", "Language", "Trained")
    click.echo(heading)
    click.echo("-" * len(heading))

    # Only projects with at least Access.private are requested
    projects = annif.registry.get_projects(min_access=Access.private)
    for project in projects.values():
        row = row_format.format(
            project.project_id,
            project.name,
            project.language,
            str(project.is_trained))
        click.echo(row)


@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    project = get_project(project_id)
    # One line per attribute; labels are padded so the values line up.
    # Attributes are read one at a time, right before each line is printed.
    click.echo(f'Project ID:        {project.project_id}')
    click.echo(f'Project Name:      {project.name}')
    click.echo(f'Language:          {project.language}')
    click.echo(f'Access:            {project.access.name}')
    click.echo(f'Trained:           {project.is_trained}')
    click.echo(f'Modification time: {project.modification_time}')


@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # get_project() exits the process itself when the ID is unknown
    get_project(project_id).remove_model_data()


@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    project = get_project(project_id)
    language = project.language
    # SKOS/RDF files supported by rdflib get the SKOS reader; anything else
    # is probably a TSV file.
    if not annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        subjects = annif.corpus.SubjectFileTSV(subjectfile)
    else:
        subjects = annif.corpus.SubjectFileSKOS(subjectfile, language)
    project.vocab.load_vocabulary(subjects, language)


@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--cached/--no-cached', default=False,
              help='Reuse preprocessed training data from previous run')
@backend_param_option
@common_options
def run_train(project_id, paths, cached, backend_param):
    """
    Train a project on a collection of documents.
    """
    project = get_project(project_id)
    overrides = parse_backend_params(backend_param, project)

    # --cached and explicit corpus paths are mutually exclusive
    if cached and len(paths) > 0:
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option.")

    # With --cached the literal string 'cached' is passed in place of a
    # corpus object.
    corpus = 'cached' if cached else open_documents(paths)
    project.train(corpus, overrides)


@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@backend_param_option
@common_options
def run_learn(project_id, paths, backend_param):
    """
    Further train an existing project on a collection of documents.
    """
    project = get_project(project_id)
    # Overrides are validated against the project before the corpus is opened
    overrides = parse_backend_params(backend_param, project)
    corpus = open_documents(paths)
    project.learn(corpus, overrides)


@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    # The whole document is read from stdin in one go
    text = sys.stdin.read()
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)
    filtered = hit_filter(project.suggest(text, backend_params))

    # One tab-separated line per hit: <uri>, label and/or notation
    # (whichever are non-empty), score.
    row_format = "<{}>\t{}\t{}"
    for hit in filtered.as_list(project.subjects):
        click.echo(row_format.format(
            hit.uri,
            '\t'.join(
                part for part in (hit.label, hit.notation) if part),
            hit.score))


@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
    '--suffix',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    # Each item is unpacked as (document file name, subject file name);
    # the second element is not used here.
    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        # Derive the result file name by swapping a trailing ".txt" for the
        # suffix. A callable replacement inserts the user-supplied suffix
        # literally; a plain string would be treated as a replacement
        # template in which backslashes are interpreted.
        subjectfilename = re.sub(
            r'\.txt$', lambda _match: suffix, docfilename)

        # Decide whether to skip before doing any work on the document
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue

        # 'utf-8-sig' drops a leading byte order mark if one is present
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()

        # Compute the suggestions before opening the output file so that a
        # failure here does not leave a truncated, empty result file behind.
        results = project.suggest(text, backend_params)
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hit_filter(results).as_list(project.subjects):
                line = "<{}>\t{}\t{}".format(
                    hit.uri,
                    '\t'.join(filter(None, (hit.label, hit.notation))),
                    hit.score)
                click.echo(line, file=subjfile)


@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option(
    '--results-file',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""")
@click.option('--jobs',
              default=1,
              help='Number of parallel jobs (0 means all CPUs)')
@backend_param_option
@common_options
def run_eval(project_id, paths, limit, threshold, results_file, jobs,
             backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # The file option is declared lazy, so write an empty string now to
        # surface any problem with the path before the evaluation starts.
        try:
            print('', end='', file=results_file)
            notice = 'Writing per subject evaluation results to {!s}'.format(
                results_file.name)
            click.echo(notice)
        except Exception as err:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(err))
    corpus = open_documents(paths)

    n_jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize()
    suggest_map = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    # Results arrive in completion order. Each one is unpacked as
    # (hits keyed by project ID, gold URIs, gold labels).
    with pool_class(n_jobs) as pool:
        suggestions = pool.imap_unordered(
            suggest_map.suggest, corpus.documents)
        for hits, uris, labels in suggestions:
            gold_subjects = annif.corpus.SubjectSet((uris, labels))
            batch.evaluate(hits[project_id], gold_subjects)

    row_format = "{0:<30}\t{1}"
    results = batch.results(results_file=results_file)
    for metric, score in results.items():
        click.echo(row_format.format(metric + ":", score))


@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@backend_param_option
@common_options
def run_optimize(project_id, paths, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    # Suggest once per document, then score that single result under every
    # limit/threshold combination.
    ndocs = 0
    docs = open_documents(paths)
    for doc in docs.documents:
        hits = project.suggest(doc.text, backend_params)
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    # The metric list never changes and is also needed by the summary after
    # the loop, so it is defined once up front rather than per iteration.
    metrics = ['Precision (doc avg)',
               'Recall (doc avg)',
               'F1 score (doc avg)',
               'NDCG@5',
               'NDCG@10']

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a queue that gets consumed along the way
    # This way GC will have a chance to reclaim the memory. A deque is used
    # because removing from the front of a list is O(n).
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        params, (_, batch) = filter_batches.popleft()
        results = batch.results(metrics=metrics)
        for metric, score in results.items():
            # ">=" lets a later combination win ties
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))


@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--trials', default=10, help='Number of trials')
@click.option('--jobs',
              default=1,
              help='Number of parallel runs (-1 means all CPUs)')
@click.option('--metric', default='NDCG', help='Metric to optimize')
@click.option(
    '--results-file',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""")
@common_options
def run_hyperopt(project_id, paths, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using a validation corpus.
    """
    project = get_project(project_id)
    corpus = open_documents(paths)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    rec = project.hyperopt(corpus, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")

    # The recommendation's lines are printed between two "---" separators
    separator = "---"
    click.echo(separator)
    for line in rec.lines:
        click.echo(line)
    click.echo(separator)


# Invoke the command group when this module is executed as a script.
if __name__ == '__main__':
    cli()


1			"""Definitions for command-line (Click) commands for invoking Annif
2			operations and printing the results to console."""
3
4
5			import collections
6			import os.path
7			import re
8			import sys
9			import click
10			import click_log
11			from flask import current_app
12			from flask.cli import FlaskGroup, ScriptInfo
13			import annif
14			import annif.corpus
15			import annif.eval
16			import annif.parallel
17			import annif.project
18			import annif.registry
19			from annif.project import Access
20			from annif.suggestion import SuggestionFilter
21			from annif.exception import ConfigurationException, NotSupportedException
22
23			logger = annif.logger
24			click_log.basic_config(logger)
25
26			cli = FlaskGroup(create_app=annif.create_app)
27
28
29			def get_project(project_id):
30			"""
31			Helper function to get a project by ID and bail out if it doesn't exist"""
32			try:
33			return annif.registry.get_project(project_id, min_access=Access.hidden)
34			except ValueError:
35			click.echo(
36			"No projects found with id \'{0}\'.".format(project_id),
37			err=True)
38			sys.exit(1)
39
40
41			def open_documents(paths):
42			"""Helper function to open a document corpus from a list of pathnames,
43			each of which is either a TSV file or a directory of TXT files. The
44			corpus will be returned as an instance of DocumentCorpus."""
45
46			def open_doc_path(path):
47			"""open a single path and return it as a DocumentCorpus"""
48			if os.path.isdir(path):
49			return annif.corpus.DocumentDirectory(path, require_subjects=True)
50			return annif.corpus.DocumentFile(path)
51
52			if len(paths) == 0:
53			logger.warning('Reading empty file')
54			docs = open_doc_path(os.path.devnull)
55			elif len(paths) == 1:
56			docs = open_doc_path(paths[0])
57			else:
58			corpora = [open_doc_path(path) for path in paths]
59			docs = annif.corpus.CombinedCorpus(corpora)
60			return docs
61
62
63			def parse_backend_params(backend_param, project):
64			"""Parse a list of backend parameters given with the --backend-param
65			option into a nested dict structure"""
66			backend_params = collections.defaultdict(dict)
67			for beparam in backend_param:
68			backend, param = beparam.split('.', 1)
69			key, val = param.split('=', 1)
70			validate_backend_params(backend, beparam, project)
71			backend_params[backend][key] = val
72			return backend_params
73
74
75			def validate_backend_params(backend, beparam, project):
76			if 'algorithm' in beparam:
77			raise NotSupportedException('Algorithm overriding not supported.')
78			if backend != project.config['backend']:
79			raise ConfigurationException(
80			'The backend {} in CLI option "-b {}" not matching the project'
81			' backend {}.'
82			.format(backend, beparam, project.config['backend']))
83
84
85			def generate_filter_batches(subjects):
86			filter_batches = collections.OrderedDict()
87			for limit in range(1, 16):
88			for threshold in [i * 0.05 for i in range(20)]:
89			hit_filter = SuggestionFilter(subjects, limit, threshold)
90			batch = annif.eval.EvaluationBatch(subjects)
91			filter_batches[(limit, threshold)] = (hit_filter, batch)
92			return filter_batches
93
94
95			def set_project_config_file_path(ctx, param, value):
96			"""Override the default path or the path given in env by CLI option"""
97			with ctx.ensure_object(ScriptInfo).load_app().app_context():
98			if value:
99			current_app.config['PROJECTS_FILE'] = value
100
101
102			def common_options(f):
103			"""Decorator to add common options for all CLI commands"""
104			f = click.option(
105			'-p', '--projects', help='Set path to projects.cfg',
106			type=click.Path(dir_okay=False, exists=True),
107			callback=set_project_config_file_path, expose_value=False,
108			is_eager=True)(f)
109			return click_log.simple_verbosity_option(logger)(f)
110
111
112			def backend_param_option(f):
113			"""Decorator to add an option for CLI commands to override BE parameters"""
114			return click.option(
115			'--backend-param', '-b', multiple=True,
116			help='Override backend parameter of the config file. ' +
117			'Syntax: "-b <backend>.<parameter>=<value>".')(f)
118
119
120			@cli.command('list-projects')
121			@common_options
122			@click_log.simple_verbosity_option(logger, default='ERROR')
123			def run_list_projects():
124			"""
125			List available projects.
126			"""
127
128			template = "{0: <25}{1: <45}{2: <10}{3: <7}"
129			header = template.format(
130			"Project ID", "Project Name", "Language", "Trained")
131			click.echo(header)
132			click.echo("-" * len(header))
133			for proj in annif.registry.get_projects(
134			min_access=Access.private).values():
135			click.echo(template.format(
136			proj.project_id, proj.name, proj.language, str(proj.is_trained)))
137
138
139			@cli.command('show-project')
140			@click.argument('project_id')
141			@common_options
142			def run_show_project(project_id):
143			"""
144			Show information about a project.
145			"""
146
147			proj = get_project(project_id)
148			click.echo(f'Project ID: {proj.project_id}')
149			click.echo(f'Project Name: {proj.name}')
150			click.echo(f'Language: {proj.language}')
151			click.echo(f'Access: {proj.access.name}')
152			click.echo(f'Trained: {proj.is_trained}')
153			click.echo(f'Modification time: {proj.modification_time}')
154
155
156			@cli.command('clear')
157			@click.argument('project_id')
158			@common_options
159			def run_clear_project(project_id):
160			"""
161			Initialize the project to its original, untrained state.
162			"""
163			proj = get_project(project_id)
164			proj.remove_model_data()
165
166
167			@cli.command('loadvoc')
168			@click.argument('project_id')
169			@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
170			@common_options
171			def run_loadvoc(project_id, subjectfile):
172			"""
173			Load a vocabulary for a project.
174			"""
175			proj = get_project(project_id)
176			if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
177			# SKOS/RDF file supported by rdflib
178			subjects = annif.corpus.SubjectFileSKOS(subjectfile, proj.language)
179			else:
180			# probably a TSV file
181			subjects = annif.corpus.SubjectFileTSV(subjectfile)
182			proj.vocab.load_vocabulary(subjects, proj.language)
183
184
185			@cli.command('train')
186			@click.argument('project_id')
187			@click.argument('paths', type=click.Path(exists=True), nargs=-1)
188			@click.option('--cached/--no-cached', default=False,
189			help='Reuse preprocessed training data from previous run')
190			@backend_param_option
191			@common_options
192			def run_train(project_id, paths, cached, backend_param):
193			"""
194			Train a project on a collection of documents.
195			"""
196			proj = get_project(project_id)
197			backend_params = parse_backend_params(backend_param, proj)
198			if cached:
199			if len(paths) > 0:
200			raise click.UsageError(
201			"Corpus paths cannot be given when using --cached option.")
202			documents = 'cached'
203			else:
204			documents = open_documents(paths)
205			proj.train(documents, backend_params)
206
207
208			@cli.command('learn')
209			@click.argument('project_id')
210			@click.argument('paths', type=click.Path(exists=True), nargs=-1)
211			@backend_param_option
212			@common_options
213			def run_learn(project_id, paths, backend_param):
214			"""
215			Further train an existing project on a collection of documents.
216			"""
217			proj = get_project(project_id)
218			backend_params = parse_backend_params(backend_param, proj)
219			documents = open_documents(paths)
220			proj.learn(documents, backend_params)
221
222
223			@cli.command('suggest')
224			@click.argument('project_id')
225			@click.option('--limit', default=10, help='Maximum number of subjects')
226			@click.option('--threshold', default=0.0, help='Minimum score threshold')
227			@backend_param_option
228			@common_options
229			def run_suggest(project_id, limit, threshold, backend_param):
230			"""
231			Suggest subjects for a single document from standard input.
232			"""
233			project = get_project(project_id)
234			text = sys.stdin.read()
235			backend_params = parse_backend_params(backend_param, project)
236			hit_filter = SuggestionFilter(project.subjects, limit, threshold)
237			hits = hit_filter(project.suggest(text, backend_params))
238			for hit in hits.as_list(project.subjects):
239			click.echo(
240			"<{}>\t{}\t{}".format(
241			hit.uri,
242			'\t'.join(filter(None, (hit.label, hit.notation))),
243			hit.score))
244
245
246			@cli.command('index')
247			@click.argument('project_id')
248			@click.argument('directory', type=click.Path(exists=True, file_okay=False))
249			@click.option(
250			'--suffix',
251			default='.annif',
252			help='File name suffix for result files')
253			@click.option('--force/--no-force', default=False,
254			help='Force overwriting of existing result files')
255			@click.option('--limit', default=10, help='Maximum number of subjects')
256			@click.option('--threshold', default=0.0, help='Minimum score threshold')
257			@backend_param_option
258			@common_options
259			def run_index(project_id, directory, suffix, force,
260			limit, threshold, backend_param):
261			"""
262			Index a directory with documents, suggesting subjects for each document.
263			Write the results in TSV files with the given suffix.
264			"""
265			project = get_project(project_id)
266			backend_params = parse_backend_params(backend_param, project)
267			hit_filter = SuggestionFilter(project.subjects, limit, threshold)
268
269			for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
270			directory, require_subjects=False):
271			with open(docfilename, encoding='utf-8-sig') as docfile:
272			text = docfile.read()
273			subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
274			if os.path.exists(subjectfilename) and not force:
275			click.echo(
276			"Not overwriting {} (use --force to override)".format(
277			subjectfilename))
278			continue
279			with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
280			results = project.suggest(text, backend_params)
281			for hit in hit_filter(results).as_list(project.subjects):
282			line = "<{}>\t{}\t{}".format(
283			hit.uri,
284			'\t'.join(filter(None, (hit.label, hit.notation))),
285			hit.score)
286			click.echo(line, file=subjfile)
287
288
289			@cli.command('eval')
290			@click.argument('project_id')
291			@click.argument('paths', type=click.Path(exists=True), nargs=-1)
292			@click.option('--limit', default=10, help='Maximum number of subjects')
293			@click.option('--threshold', default=0.0, help='Minimum score threshold')
294			@click.option(
295			'--results-file',
296			type=click.File(
297			'w',
298			encoding='utf-8',
299			errors='ignore',
300			lazy=True),
301			help="""Specify file in order to write non-aggregated results per subject.
302			File directory must exist, existing file will be overwritten.""")
303			@click.option('--jobs',
304			default=1,
305			help='Number of parallel jobs (0 means all CPUs)')
306			@backend_param_option
307			@common_options
308			def run_eval(
309			project_id,
310			paths,
311			limit,
312			threshold,
313			results_file,
314			jobs,
315			backend_param):
316			"""
317			Analyze documents and evaluate the result.
318
319			Compare the results of automated indexing against a gold standard. The
320			path may be either a TSV file with short documents or a directory with
321			documents in separate files.
322			"""
323
324			project = get_project(project_id)
325			backend_params = parse_backend_params(backend_param, project)
326
327			eval_batch = annif.eval.EvaluationBatch(project.subjects)
328
329			if results_file:
330			try:
331			print('', end='', file=results_file)
332			click.echo('Writing per subject evaluation results to {!s}'.format(
333			results_file.name))
334			except Exception as e:
335			raise NotSupportedException(
336			"cannot open results-file for writing: " + str(e))
337			docs = open_documents(paths)
338
339			jobs, pool_class = annif.parallel.get_pool(jobs)
340
341			project.initialize()
342			psmap = annif.parallel.ProjectSuggestMap(
343			project.registry, [project_id], backend_params, limit, threshold)
344
345			with pool_class(jobs) as pool:
346			for hits, uris, labels in pool.imap_unordered(
347			psmap.suggest, docs.documents):
348			eval_batch.evaluate(hits[project_id],
349			annif.corpus.SubjectSet((uris, labels)))
350
351			template = "{0:<30}\t{1}"
352			for metric, score in eval_batch.results(results_file=results_file).items():
353			click.echo(template.format(metric + ":", score))
354
355
356			@cli.command('optimize')
357			@click.argument('project_id')
358			@click.argument('paths', type=click.Path(exists=True), nargs=-1)
359			@backend_param_option
360			@common_options
361			def run_optimize(project_id, paths, backend_param):
362			"""
363			Analyze documents, testing multiple limits and thresholds.
364
365			Evaluate the analysis results for a directory with documents against a
366			gold standard given in subject files. Test different limit/threshold
367			values and report the precision, recall and F-measure of each combination
368			of settings.
369			"""
370			project = get_project(project_id)
371			backend_params = parse_backend_params(backend_param, project)
372
373			filter_batches = generate_filter_batches(project.subjects)
374
375			ndocs = 0
376			docs = open_documents(paths)
377			for doc in docs.documents:
378			hits = project.suggest(doc.text, backend_params)
379			gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
380			for hit_filter, batch in filter_batches.values():
381			batch.evaluate(hit_filter(hits), gold_subjects)
382			ndocs += 1
383
384			click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))
385
386			best_scores = collections.defaultdict(float)
387			best_params = {}
388
389			template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
390			# Store the batches in a list that gets consumed along the way
391			# This way GC will have a chance to reclaim the memory
392			filter_batches = list(filter_batches.items())
393			while filter_batches:
394			params, filter_batch = filter_batches.pop(0)
395			metrics = ['Precision (doc avg)',
396			'Recall (doc avg)',
397			'F1 score (doc avg)',
398			'NDCG@5',
399			'NDCG@10']
400			results = filter_batch[1].results(metrics=metrics)
401			for metric, score in results.items():
402			if score >= best_scores[metric]:
403			best_scores[metric] = score
404			best_params[metric] = params
405			click.echo(
406			template.format(
407			params[0],
408			params[1],
409			results['Precision (doc avg)'],
410			results['Recall (doc avg)'],
411			results['F1 score (doc avg)']))
412
413			click.echo()
414			template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
415			for metric in metrics:
416			click.echo(
417			template2.format(
418			metric,
419			best_scores[metric],
420			best_params[metric][0],
421			best_params[metric][1]))
422			click.echo("Documents evaluated:\t{}".format(ndocs))
423
424
425			@cli.command('hyperopt')
426			@click.argument('project_id')
427			@click.argument('paths', type=click.Path(exists=True), nargs=-1)
428			@click.option('--trials', default=10, help='Number of trials')
429			@click.option('--jobs',
430			default=1,
431			help='Number of parallel runs (-1 means all CPUs)')
432			@click.option('--metric', default='NDCG', help='Metric to optimize')
433			@click.option(
434			'--results-file',
435			type=click.File(
436			'w',
437			encoding='utf-8',
438			errors='ignore',
439			lazy=True),
440			help="""Specify file path to write trial results as CSV.
441			File directory must exist, existing file will be overwritten.""")
442			@common_options
443			def run_hyperopt(project_id, paths, trials, jobs, metric, results_file):
444			"""
445			Optimize the hyperparameters of a project using a validation corpus.
446			"""
447			proj = get_project(project_id)
448			documents = open_documents(paths)
449			click.echo(f"Looking for optimal hyperparameters using {trials} trials")
450			rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
451			click.echo(f"Got best {metric} score {rec.score:.4f} with:")
452			click.echo("---")
453			for line in rec.lines:
454			click.echo(line)
455			click.echo("---")
456
457
458			if __name__ == '__main__':
459			cli()
460

NatLibFi / Annif

Pull Request — master (#414)

annif.cli.run_learn() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like