Passed
Pull Request — master (#414)
by Osma
02:11
created

annif.cli.run_hyperopt()   A

Complexity

Conditions 2

Size

Total Lines 22
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 19
dl 0
loc 22
rs 9.45
c 0
b 0
f 0
cc 2
nop 5
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import multiprocessing
7
import multiprocessing.dummy
8
import os.path
9
import re
10
import sys
11
import click
12
import click_log
13
from flask import current_app
14
from flask.cli import FlaskGroup, ScriptInfo
15
import annif
16
import annif.corpus
17
import annif.eval
18
import annif.project
19
import annif.registry
20
from annif.project import Access
21
from annif.suggestion import SuggestionFilter
22
from annif.exception import ConfigurationException, NotSupportedException
23
24
logger = annif.logger
25
click_log.basic_config(logger)
26
27
cli = FlaskGroup(create_app=annif.create_app)
28
29
30
def get_project(project_id):
    """Look up a project by ID and return it.

    Exits the CLI with an error message when no such project exists."""
    try:
        return annif.registry.get_project(project_id, min_access=Access.hidden)
    except ValueError:
        message = "No projects found with id '{0}'.".format(project_id)
        click.echo(message, err=True)
        sys.exit(1)
40
41
42
def open_documents(paths):
    """Open a document corpus from a list of pathnames and return it as a
    DocumentCorpus instance.

    Each path is either a TSV file or a directory of TXT files."""

    def open_doc_path(path):
        """Return a DocumentCorpus for a single path."""
        if os.path.isdir(path):
            return annif.corpus.DocumentDirectory(path, require_subjects=True)
        return annif.corpus.DocumentFile(path)

    if not paths:
        # no input given: fall back to the null device so callers still
        # receive a (empty) corpus object
        logger.warning('Reading empty file')
        return open_doc_path(os.path.devnull)
    if len(paths) == 1:
        return open_doc_path(paths[0])
    return annif.corpus.CombinedCorpus(
        [open_doc_path(path) for path in paths])
62
63
64
def parse_backend_params(backend_param, project):
    """Parse the repeatable --backend-param CLI options into a nested
    ``{backend: {key: value}}`` dict structure."""
    parsed = collections.defaultdict(dict)
    for beparam in backend_param:
        # expected syntax: <backend>.<key>=<value>
        backend, param = beparam.split('.', 1)
        key, val = param.split('=', 1)
        validate_backend_params(backend, beparam, project)
        parsed[backend][key] = val
    return parsed
74
75
76
def validate_backend_params(backend, beparam, project):
    """Reject backend parameter overrides that are unsupported or that
    target a backend other than the project's configured one."""
    # NOTE: this is a substring match over the whole "backend.key=value"
    # string, so any parameter mentioning "algorithm" is rejected
    if 'algorithm' in beparam:
        raise NotSupportedException('Algorithm overriding not supported.')
    project_backend = project.config['backend']
    if backend != project_backend:
        raise ConfigurationException(
            'The backend {} in CLI option "-b {}" not matching the project'
            ' backend {}.'
            .format(backend, beparam, project_backend))
84
85
86
def generate_filter_batches(subjects):
    """Build an ordered mapping from (limit, threshold) combinations to
    (SuggestionFilter, EvaluationBatch) pairs for the optimize grid search."""
    batches = collections.OrderedDict()
    thresholds = [step * 0.05 for step in range(20)]
    for limit in range(1, 16):
        for threshold in thresholds:
            batches[(limit, threshold)] = (
                SuggestionFilter(subjects, limit, threshold),
                annif.eval.EvaluationBatch(subjects))
    return batches
94
95
96
def set_project_config_file_path(ctx, param, value):
    """Click callback: override the default projects.cfg path (or the one
    from the environment) with the -p/--projects CLI option value."""
    app = ctx.ensure_object(ScriptInfo).load_app()
    with app.app_context():
        if value:
            current_app.config['PROJECTS_FILE'] = value
101
102
103
def common_options(f):
    """Decorator adding the options shared by all CLI commands:
    -p/--projects and the click-log verbosity option."""
    projects_option = click.option(
        '-p', '--projects', help='Set path to projects.cfg',
        type=click.Path(dir_okay=False, exists=True),
        callback=set_project_config_file_path, expose_value=False,
        is_eager=True)
    verbosity_option = click_log.simple_verbosity_option(logger)
    return verbosity_option(projects_option(f))
111
112
113
def backend_param_option(f):
    """Decorator adding the -b/--backend-param override option to a
    CLI command."""
    option = click.option(
        '--backend-param', '-b', multiple=True,
        help='Override backend parameter of the config file. ' +
        'Syntax: "-b <backend>.<parameter>=<value>".')
    return option(f)
119
120
121
@cli.command('list-projects')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_projects():
    """
    List available projects.
    """

    # fixed-width columns give a simple aligned table
    template = "{0: <25}{1: <45}{2: <10}{3: <7}"
    header = template.format(
        "Project ID", "Project Name", "Language", "Trained")
    click.echo(header)
    click.echo("-" * len(header))
    projects = annif.registry.get_projects(min_access=Access.private)
    for proj in projects.values():
        click.echo(template.format(
            proj.project_id, proj.name, proj.language, str(proj.is_trained)))
138
139
140
@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = get_project(project_id)
    # one echo per field; labels padded to align the values
    details = [
        f'Project ID:        {proj.project_id}',
        f'Project Name:      {proj.name}',
        f'Language:          {proj.language}',
        f'Access:            {proj.access.name}',
        f'Trained:           {proj.is_trained}',
        f'Modification time: {proj.modification_time}',
    ]
    for detail in details:
        click.echo(detail)
155
156
157
@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # removing the model data resets the project to untrained
    get_project(project_id).remove_model_data()
166
167
168
@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    proj = get_project(project_id)
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile, proj.language)
    else:
        # anything else is assumed to be a TSV file
        subjects = annif.corpus.SubjectFileTSV(subjectfile)
    proj.vocab.load_vocabulary(subjects, proj.language)
184
185
186
@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--cached/--no-cached', default=False,
              help='Reuse preprocessed training data from previous run')
@backend_param_option
@common_options
def run_train(project_id, paths, cached, backend_param):
    """
    Train a project on a collection of documents.
    """
    proj = get_project(project_id)
    backend_params = parse_backend_params(backend_param, proj)
    if not cached:
        documents = open_documents(paths)
    else:
        if len(paths) > 0:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option.")
        # the sentinel string 'cached' tells the project to reuse the
        # preprocessed training data from the previous run
        documents = 'cached'
    proj.train(documents, backend_params)
207
208
209
@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@backend_param_option
@common_options
def run_learn(project_id, paths, backend_param):
    """
    Further train an existing project on a collection of documents.
    """
    proj = get_project(project_id)
    params = parse_backend_params(backend_param, proj)
    corpus = open_documents(paths)
    proj.learn(corpus, params)
222
223
224
@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    text = sys.stdin.read()
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)
    hits = hit_filter(project.suggest(text, backend_params))
    for hit in hits.as_list(project.subjects):
        # notation may be missing, so drop empty fields before joining
        labels = '\t'.join(filter(None, (hit.label, hit.notation)))
        click.echo("<{}>\t{}\t{}".format(hit.uri, labels, hit.score))
245
246
247
@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
    '--suffix',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
        # check for an existing result file *before* reading the document,
        # so skipped documents cost no extra I/O
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue
        # utf-8-sig transparently strips a BOM if present
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            results = project.suggest(text, backend_params)
            for hit in hit_filter(results).as_list(project.subjects):
                # notation may be missing; drop empty fields before joining
                line = "<{}>\t{}\t{}".format(
                    hit.uri,
                    '\t'.join(filter(None, (hit.label, hit.notation))),
                    hit.score)
                click.echo(line, file=subjfile)
288
289
290
@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option(
    '--results-file',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""")
@click.option('--jobs',
              default=1,
              help='Number of parallel jobs (0 means all CPUs)')
@backend_param_option
@common_options
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        results_file,
        jobs,
        backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # the file is lazy; force it open now so a bad path fails fast
        # instead of after the whole evaluation run
        try:
            print('', end='', file=results_file)
            click.echo('Writing per subject evaluation results to {!s}'.format(
                results_file.name))
        except Exception as e:
            # chain the original error so the root cause stays visible
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)) from e
    docs = open_documents(paths)

    if jobs < 1:
        # Pool(None) uses all available CPUs
        jobs = None
        pool_class = multiprocessing.Pool
    elif jobs == 1:
        # use the dummy wrapper around threading to avoid subprocess overhead
        pool_class = multiprocessing.dummy.Pool
    else:
        pool_class = multiprocessing.Pool

    project.initialize()
    psmap = annif.project.ProjectSuggestMap(
        project, backend_params, limit, threshold)

    with pool_class(jobs) as pool:
        # result order does not affect the aggregated scores, so
        # imap_unordered is safe here
        for hits, uris, labels in pool.imap_unordered(
                psmap.suggest, docs.documents):
            eval_batch.evaluate(hits,
                                annif.corpus.SubjectSet((uris, labels)))

    template = "{0:<30}\t{1}"
    for metric, score in eval_batch.results(results_file=results_file).items():
        click.echo(template.format(metric + ":", score))
362
363
364
@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@backend_param_option
@common_options
def run_optimize(project_id, paths, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths)
    for doc in docs.documents:
        # suggest once per document, then feed the same hits through every
        # (limit, threshold) filter combination
        hits = project.suggest(doc.text, backend_params)
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    # loop-invariant: hoisted out of the loop below (it was previously
    # rebuilt on every iteration and relied on leaking out of the loop)
    metrics = ['Precision (doc avg)',
               'Recall (doc avg)',
               'F1 score (doc avg)',
               'NDCG@5',
               'NDCG@10']

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a list that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = list(filter_batches.items())
    while filter_batches:
        params, filter_batch = filter_batches.pop(0)
        results = filter_batch[1].results(metrics=metrics)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))
431
432
433
@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--trials', default=10, help='Number of trials')
@click.option('--jobs',
              default=1,
              help='Number of parallel runs (-1 means all CPUs)')
@click.option('--metric', default='NDCG', help='Metric to optimize')
@common_options
def run_hyperopt(project_id, paths, trials, jobs, metric):
    """
    Optimize the hyperparameters of a project using a validation corpus.
    """
    project = get_project(project_id)
    corpus = open_documents(paths)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    recommendation = project.hyperopt(corpus, trials, jobs, metric)
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")
    # print the recommended configuration lines between "---" markers
    click.echo("---")
    for config_line in recommendation.lines:
        click.echo(config_line)
    click.echo("---")
455
456
457
# Allow running this module directly (`python cli.py`).
if __name__ == '__main__':
    cli()
459