annif.cli.run_hyperopt() - Code Metrics - Inspection of "Initial implementation: hyperparameter optimizatio..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#414)

by Osma

created 2020-05-18 07:09 UTC

annif.cli.run_hyperopt() A

↳ Parent: annif.cli

Complexity

Conditions

Size

Total Lines	15
Code Lines	12

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	12
dl	0
loc	15
rs	9.8
c	0
b	0
f	0
cc	2
nop	3

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import os.path
import re
import sys
import click
import click_log
from flask import current_app
from flask.cli import FlaskGroup, ScriptInfo
import annif
import annif.corpus
import annif.eval
import annif.project
from annif.project import Access
from annif.suggestion import SuggestionFilter
from annif.exception import ConfigurationException, NotSupportedException

# Reuse the package-wide Annif logger and let click_log configure it.
logger = annif.logger
click_log.basic_config(logger)

# Root command group; every command below is registered on it via
# @cli.command. The Flask application is created with annif.create_app.
cli = FlaskGroup(create_app=annif.create_app)


def get_project(project_id):
    """Return the project with the given ID, or exit if the lookup fails.

    The lookup is done with ``Access.hidden`` as the minimum access level.
    If it raises ``ValueError``, an error message is written to standard
    error and the process exits with status 1.
    """
    try:
        project = annif.project.get_project(
            project_id, min_access=Access.hidden)
    except ValueError:
        message = "No projects found with id \'{0}\'.".format(project_id)
        click.echo(message, err=True)
        sys.exit(1)
    return project


def open_documents(paths):
    """Open the given pathnames as one document corpus.

    Every path is expected to be either a TSV file or a directory of TXT
    files. With no paths, a warning is logged and ``os.path.devnull`` is
    opened instead; with a single path, its corpus is returned directly;
    with several, the individual corpora are wrapped in a CombinedCorpus.
    """

    def _as_corpus(path):
        """Open one path: directories need subject files, files are TSV."""
        if not os.path.isdir(path):
            return annif.corpus.DocumentFile(path)
        return annif.corpus.DocumentDirectory(path, require_subjects=True)

    if not paths:
        logger.warning('Reading empty file')
        return _as_corpus(os.path.devnull)
    if len(paths) == 1:
        return _as_corpus(paths[0])
    return annif.corpus.CombinedCorpus([_as_corpus(item) for item in paths])


def parse_backend_params(backend_param, project):
    """Parse the values of the --backend-param option into a nested dict.

    Each value must have the form ``<backend>.<parameter>=<value>``. The
    string is split on the first '.' and then on the first '=', so the value
    part may itself contain either character.

    Parameters:
        backend_param: iterable of raw option strings.
        project: the project the parameters are validated against.

    Returns:
        A ``defaultdict(dict)`` mapping backend -> {parameter: value}; all
        values are kept as strings.

    Raises:
        ConfigurationException: if a value does not follow the expected
            syntax; validate_backend_params may raise further exceptions.
    """
    backend_params = collections.defaultdict(dict)
    for beparam in backend_param:
        try:
            backend, param = beparam.split('.', 1)
            key, val = param.split('=', 1)
        except ValueError:
            # A missing '.' or '=' makes the tuple unpacking fail; report it
            # as a configuration problem that names the offending option
            # instead of surfacing a bare "not enough values to unpack".
            raise ConfigurationException(
                'Invalid backend parameter "{}"; expected syntax '
                '"<backend>.<parameter>=<value>".'.format(beparam)) from None
        validate_backend_params(backend, beparam, project)
        backend_params[backend][key] = val
    return backend_params


def validate_backend_params(backend, beparam, project):
    """Check that a backend parameter override may be applied to a project.

    Raises NotSupportedException when the text 'algorithm' occurs anywhere
    in the raw option string, and ConfigurationException when the backend
    named in the option differs from ``project.config['backend']``.
    Returns None when both checks pass.
    """
    if 'algorithm' in beparam:
        raise NotSupportedException('Algorithm overriding not supported.')
    if backend == project.config['backend']:
        return
    raise ConfigurationException(
        'The backend {} in CLI option "-b {}" not matching the project'
        ' backend {}.'
        .format(backend, beparam, project.config['backend']))


def generate_filter_batches(subjects):
    """Build one (filter, evaluation batch) pair per limit/threshold pair.

    Limits run from 1 to 15 and thresholds over ``i * 0.05`` for i in 0..19.
    The result is an OrderedDict keyed by ``(limit, threshold)`` whose values
    are ``(SuggestionFilter, EvaluationBatch)`` tuples, inserted limit by
    limit with thresholds ascending within each limit.
    """
    thresholds = [step * 0.05 for step in range(20)]
    batches = collections.OrderedDict()
    for limit in range(1, 16):
        for threshold in thresholds:
            batches[(limit, threshold)] = (
                SuggestionFilter(limit, threshold),
                annif.eval.EvaluationBatch(subjects))
    return batches


def set_project_config_file_path(ctx, param, value):
    """Click callback storing the -p/--projects path in the app config.

    The application is loaded and its context entered whether or not a
    value was given; ``PROJECTS_FILE`` is only overwritten for a non-empty
    value. ``param`` is unused.
    """
    script_info = ctx.ensure_object(ScriptInfo)
    app = script_info.load_app()
    with app.app_context():
        if not value:
            return
        current_app.config['PROJECTS_FILE'] = value


def common_options(f):
    """Decorate a CLI command with the options shared by all commands.

    Adds the eager ``-p/--projects`` option (handled by
    set_project_config_file_path and not passed on to the command) and then
    the click_log verbosity option bound to the module logger.
    """
    projects_option = click.option(
        '-p', '--projects', help='Set path to projects.cfg',
        type=click.Path(dir_okay=False, exists=True),
        callback=set_project_config_file_path, expose_value=False,
        is_eager=True)
    with_projects = projects_option(f)
    verbosity_option = click_log.simple_verbosity_option(logger)
    return verbosity_option(with_projects)


def backend_param_option(f):
    """Decorate a CLI command with the repeatable --backend-param/-b option
    used to override backend parameters from the command line."""
    help_text = ('Override backend parameter of the config file. ' +
                 'Syntax: "-b <backend>.<parameter>=<value>".')
    add_option = click.option(
        '--backend-param', '-b', multiple=True, help=help_text)
    return add_option(f)


@cli.command('list-projects')
@common_options
def run_list_projects():
    """
    List available projects.
    """
    # One fixed-width row per project, preceded by a header row and a dashed
    # rule of the same width as the header.
    row_template = "{0: <25}{1: <45}{2: <8}"
    heading = row_template.format("Project ID", "Project Name", "Language")
    click.echo(heading)
    click.echo(len(heading) * "-")

    projects = annif.project.get_projects(min_access=Access.private)
    for project in projects.values():
        click.echo(row_template.format(
            project.project_id, project.name, project.language))


@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    def show(label, value):
        # Label is left-aligned in a 20-character column, value follows.
        click.echo("{0:<20}{1}".format(label, value))

    proj = get_project(project_id)
    show('Project ID:', proj.project_id)
    show('Project Name:', proj.name)
    show('Language:', proj.language)
    show('Access:', proj.access.name)


@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # get_project exits the program itself when the ID is unknown.
    get_project(project_id).remove_model_data()


@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    proj = get_project(project_id)
    if not annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # anything that is not recognised as RDF is treated as a TSV file
        subjects = annif.corpus.SubjectFileTSV(subjectfile)
    else:
        # SKOS/RDF file; it is read using the project language
        subjects = annif.corpus.SubjectFileSKOS(subjectfile, proj.language)
    proj.vocab.load_vocabulary(subjects, proj.language)


@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--cached/--no-cached', default=False,
              help='Reuse preprocessed training data from previous run')
@backend_param_option
@common_options
def run_train(project_id, paths, cached, backend_param):
    """
    Train a project on a collection of documents.
    """
    proj = get_project(project_id)
    backend_params = parse_backend_params(backend_param, proj)
    if not cached:
        documents = open_documents(paths)
    elif paths:
        # --cached reuses earlier data, so corpus paths make no sense here
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option.")
    else:
        # the literal string 'cached' is passed in place of a corpus
        documents = 'cached'
    proj.train(documents, backend_params)


@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@backend_param_option
@common_options
def run_learn(project_id, paths, backend_param):
    """
    Further train an existing project on a collection of documents.
    """
    proj = get_project(project_id)
    # Backend overrides are parsed (and validated) before the corpus is
    # opened.
    overrides = parse_backend_params(backend_param, proj)
    corpus = open_documents(paths)
    proj.learn(corpus, overrides)


@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    text = sys.stdin.read()
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(limit, threshold)
    suggestions = project.suggest(text, backend_params)
    for hit in hit_filter(suggestions):
        # Output columns: <uri>, then label and notation (whichever of the
        # two are non-empty), then the score, all tab-separated.
        columns = (
            hit.uri,
            '\t'.join(part for part in (hit.label, hit.notation) if part),
            hit.score)
        click.echo("<{}>\t{}\t{}".format(*columns))


@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
    '--suffix',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        # The result file name is the document name with a trailing ".txt"
        # replaced by the suffix. A callable replacement makes re.sub insert
        # the user-supplied suffix verbatim; given as a plain string it
        # would be processed as a replacement template, so a backslash in
        # the suffix would be treated as an escape or group reference.
        subjectfilename = re.sub(
            r'\.txt$', lambda _match: suffix, docfilename)
        # Decide whether to skip before reading the document at all.
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue
        # utf-8-sig drops a leading byte order mark if the file has one.
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()
        # Request the suggestions before opening the result file, so that a
        # failing suggest() call does not leave a truncated result file.
        results = project.suggest(text, backend_params)
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hit_filter(results):
                line = "<{}>\t{}\t{}".format(
                    hit.uri,
                    '\t'.join(filter(None, (hit.label, hit.notation))),
                    hit.score)
                click.echo(line, file=subjfile)


@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option(
    '--results-file',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exists, existing file will be overwritten.""")
@backend_param_option
@common_options
def run_eval(project_id, paths, limit, threshold, results_file, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    hit_filter = SuggestionFilter(limit=limit, threshold=threshold)
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # The option is declared lazy, so write an empty string now to find
        # out early, before any document is processed, whether the file can
        # be written.
        try:
            print('', end='', file=results_file)
            click.echo('Writing per subject evaluation results to {!s}'.format(
                results_file.name))
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e))

    corpus = open_documents(paths)
    for doc in corpus.documents:
        suggestions = project.suggest(doc.text, backend_params)
        filtered = hit_filter(suggestions)
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        eval_batch.evaluate(filtered, gold_subjects)

    # One "metric:<tab>score" line per metric, metric name padded to 30.
    scores = eval_batch.results(results_file=results_file)
    for metric, score in scores.items():
        click.echo("{0:<30}\t{1}".format(metric + ":", score))


@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@backend_param_option
@common_options
def run_optimize(project_id, paths, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    # Each document is suggested once; the same suggestions are then run
    # through every limit/threshold filter and scored in that filter's batch.
    ndocs = 0
    for doc in open_documents(paths).documents:
        suggestions = project.suggest(doc.text, backend_params)
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(suggestions), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    row_template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Move the batches into a queue that is consumed from the front, and
    # rebind the name so the dict is dropped; finished batches can then be
    # reclaimed by the garbage collector as the loop proceeds.
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        params, (_unused_filter, batch) = filter_batches.popleft()
        results = batch.results(metrics='simple')
        for metric, score in results.items():
            # ">=" means that on a tie the later combination wins
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        limit, threshold = params
        click.echo(
            row_template.format(
                limit,
                threshold,
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    summary_template = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    summary_metrics = (
        'Precision (doc avg)',
        'Recall (doc avg)',
        'F1 score (doc avg)',
        'NDCG@5',
        'NDCG@10')
    for metric in summary_metrics:
        top_score = best_scores[metric]
        top_limit, top_threshold = best_params[metric]
        click.echo(
            summary_template.format(
                metric, top_score, top_limit, top_threshold))
    click.echo("Documents evaluated:\t{}".format(ndocs))


@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--trials', default=10, help='Number of trials')
@common_options
def run_hyperopt(project_id, paths, trials):
    """
    Optimize the hyperparameters of a project using a validation corpus.
    """
    proj = get_project(project_id)
    corpus = open_documents(paths)
    # hyperopt returns a pair: a mapping of the best hyperparameters and
    # the score that was reached with them.
    outcome = proj.hyperopt(corpus, trials)
    best, score = outcome
    click.echo(f"Best NDCG score {score} with the following hyperparameters:")
    for param, value in best.items():
        click.echo(f"{param}:\t{value}")


# Allow running this module directly as a script: invoke the command group.
if __name__ == '__main__':
    cli()


1			"""Definitions for command-line (Click) commands for invoking Annif
2			operations and printing the results to console."""
3
4
5			import collections
6			import os.path
7			import re
8			import sys
9			import click
10			import click_log
11			from flask import current_app
12			from flask.cli import FlaskGroup, ScriptInfo
13			import annif
14			import annif.corpus
15			import annif.eval
16			import annif.project
17			from annif.project import Access
18			from annif.suggestion import SuggestionFilter
19			from annif.exception import ConfigurationException, NotSupportedException
20
21			logger = annif.logger
22			click_log.basic_config(logger)
23
24			cli = FlaskGroup(create_app=annif.create_app)
25
26
27			def get_project(project_id):
28			"""
29			Helper function to get a project by ID and bail out if it doesn't exist"""
30			try:
31			return annif.project.get_project(project_id, min_access=Access.hidden)
32			except ValueError:
33			click.echo(
34			"No projects found with id \'{0}\'.".format(project_id),
35			err=True)
36			sys.exit(1)
37
38
39			def open_documents(paths):
40			"""Helper function to open a document corpus from a list of pathnames,
41			each of which is either a TSV file or a directory of TXT files. The
42			corpus will be returned as an instance of DocumentCorpus."""
43
44			def open_doc_path(path):
45			"""open a single path and return it as a DocumentCorpus"""
46			if os.path.isdir(path):
47			return annif.corpus.DocumentDirectory(path, require_subjects=True)
48			return annif.corpus.DocumentFile(path)
49
50			if len(paths) == 0:
51			logger.warning('Reading empty file')
52			docs = open_doc_path(os.path.devnull)
53			elif len(paths) == 1:
54			docs = open_doc_path(paths[0])
55			else:
56			corpora = [open_doc_path(path) for path in paths]
57			docs = annif.corpus.CombinedCorpus(corpora)
58			return docs
59
60
61			def parse_backend_params(backend_param, project):
62			"""Parse a list of backend parameters given with the --backend-param
63			option into a nested dict structure"""
64			backend_params = collections.defaultdict(dict)
65			for beparam in backend_param:
66			backend, param = beparam.split('.', 1)
67			key, val = param.split('=', 1)
68			validate_backend_params(backend, beparam, project)
69			backend_params[backend][key] = val
70			return backend_params
71
72
73			def validate_backend_params(backend, beparam, project):
74			if 'algorithm' in beparam:
75			raise NotSupportedException('Algorithm overriding not supported.')
76			if backend != project.config['backend']:
77			raise ConfigurationException(
78			'The backend {} in CLI option "-b {}" not matching the project'
79			' backend {}.'
80			.format(backend, beparam, project.config['backend']))
81
82
83			def generate_filter_batches(subjects):
84			filter_batches = collections.OrderedDict()
85			for limit in range(1, 16):
86			for threshold in [i * 0.05 for i in range(20)]:
87			hit_filter = SuggestionFilter(limit, threshold)
88			batch = annif.eval.EvaluationBatch(subjects)
89			filter_batches[(limit, threshold)] = (hit_filter, batch)
90			return filter_batches
91
92
93			def set_project_config_file_path(ctx, param, value):
94			"""Override the default path or the path given in env by CLI option"""
95			with ctx.ensure_object(ScriptInfo).load_app().app_context():
96			if value:
97			current_app.config['PROJECTS_FILE'] = value
98
99
100			def common_options(f):
101			"""Decorator to add common options for all CLI commands"""
102			f = click.option(
103			'-p', '--projects', help='Set path to projects.cfg',
104			type=click.Path(dir_okay=False, exists=True),
105			callback=set_project_config_file_path, expose_value=False,
106			is_eager=True)(f)
107			return click_log.simple_verbosity_option(logger)(f)
108
109
110			def backend_param_option(f):
111			"""Decorator to add an option for CLI commands to override BE parameters"""
112			return click.option(
113			'--backend-param', '-b', multiple=True,
114			help='Override backend parameter of the config file. ' +
115			'Syntax: "-b <backend>.<parameter>=<value>".')(f)
116
117
118			@cli.command('list-projects')
119			@common_options
120			def run_list_projects():
121			"""
122			List available projects.
123			"""
124
125			template = "{0: <25}{1: <45}{2: <8}"
126			header = template.format("Project ID", "Project Name", "Language")
127			click.echo(header)
128			click.echo("-" * len(header))
129			for proj in annif.project.get_projects(min_access=Access.private).values():
130			click.echo(template.format(proj.project_id, proj.name, proj.language))
131
132
133			@cli.command('show-project')
134			@click.argument('project_id')
135			@common_options
136			def run_show_project(project_id):
137			"""
138			Show information about a project.
139			"""
140
141			proj = get_project(project_id)
142			template = "{0:<20}{1}"
143			click.echo(template.format('Project ID:', proj.project_id))
144			click.echo(template.format('Project Name:', proj.name))
145			click.echo(template.format('Language:', proj.language))
146			click.echo(template.format('Access:', proj.access.name))
147
148
149			@cli.command('clear')
150			@click.argument('project_id')
151			@common_options
152			def run_clear_project(project_id):
153			"""
154			Initialize the project to its original, untrained state.
155			"""
156			proj = get_project(project_id)
157			proj.remove_model_data()
158
159
160			@cli.command('loadvoc')
161			@click.argument('project_id')
162			@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
163			@common_options
164			def run_loadvoc(project_id, subjectfile):
165			"""
166			Load a vocabulary for a project.
167			"""
168			proj = get_project(project_id)
169			if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
170			# SKOS/RDF file supported by rdflib
171			subjects = annif.corpus.SubjectFileSKOS(subjectfile, proj.language)
172			else:
173			# probably a TSV file
174			subjects = annif.corpus.SubjectFileTSV(subjectfile)
175			proj.vocab.load_vocabulary(subjects, proj.language)
176
177
178			@cli.command('train')
179			@click.argument('project_id')
180			@click.argument('paths', type=click.Path(exists=True), nargs=-1)
181			@click.option('--cached/--no-cached', default=False,
182			help='Reuse preprocessed training data from previous run')
183			@backend_param_option
184			@common_options
185			def run_train(project_id, paths, cached, backend_param):
186			"""
187			Train a project on a collection of documents.
188			"""
189			proj = get_project(project_id)
190			backend_params = parse_backend_params(backend_param, proj)
191			if cached:
192			if len(paths) > 0:
193			raise click.UsageError(
194			"Corpus paths cannot be given when using --cached option.")
195			documents = 'cached'
196			else:
197			documents = open_documents(paths)
198			proj.train(documents, backend_params)
199
200
201			@cli.command('learn')
202			@click.argument('project_id')
203			@click.argument('paths', type=click.Path(exists=True), nargs=-1)
204			@backend_param_option
205			@common_options
206			def run_learn(project_id, paths, backend_param):
207			"""
208			Further train an existing project on a collection of documents.
209			"""
210			proj = get_project(project_id)
211			backend_params = parse_backend_params(backend_param, proj)
212			documents = open_documents(paths)
213			proj.learn(documents, backend_params)
214
215
216			@cli.command('suggest')
217			@click.argument('project_id')
218			@click.option('--limit', default=10, help='Maximum number of subjects')
219			@click.option('--threshold', default=0.0, help='Minimum score threshold')
220			@backend_param_option
221			@common_options
222			def run_suggest(project_id, limit, threshold, backend_param):
223			"""
224			Suggest subjects for a single document from standard input.
225			"""
226			project = get_project(project_id)
227			text = sys.stdin.read()
228			backend_params = parse_backend_params(backend_param, project)
229			hit_filter = SuggestionFilter(limit, threshold)
230			hits = hit_filter(project.suggest(text, backend_params))
231			for hit in hits:
232			click.echo(
233			"<{}>\t{}\t{}".format(
234			hit.uri,
235			'\t'.join(filter(None, (hit.label, hit.notation))),
236			hit.score))
237
238
239			@cli.command('index')
240			@click.argument('project_id')
241			@click.argument('directory', type=click.Path(exists=True, file_okay=False))
242			@click.option(
243			'--suffix',
244			default='.annif',
245			help='File name suffix for result files')
246			@click.option('--force/--no-force', default=False,
247			help='Force overwriting of existing result files')
248			@click.option('--limit', default=10, help='Maximum number of subjects')
249			@click.option('--threshold', default=0.0, help='Minimum score threshold')
250			@backend_param_option
251			@common_options
252			def run_index(project_id, directory, suffix, force,
253			limit, threshold, backend_param):
254			"""
255			Index a directory with documents, suggesting subjects for each document.
256			Write the results in TSV files with the given suffix.
257			"""
258			project = get_project(project_id)
259			backend_params = parse_backend_params(backend_param, project)
260			hit_filter = SuggestionFilter(limit, threshold)
261
262			for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
263			directory, require_subjects=False):
264			with open(docfilename, encoding='utf-8-sig') as docfile:
265			text = docfile.read()
266			subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
267			if os.path.exists(subjectfilename) and not force:
268			click.echo(
269			"Not overwriting {} (use --force to override)".format(
270			subjectfilename))
271			continue
272			with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
273			results = project.suggest(text, backend_params)
274			for hit in hit_filter(results):
275			line = "<{}>\t{}\t{}".format(
276			hit.uri,
277			'\t'.join(filter(None, (hit.label, hit.notation))),
278			hit.score)
279			click.echo(line, file=subjfile)
280
281
282			@cli.command('eval')
283			@click.argument('project_id')
284			@click.argument('paths', type=click.Path(exists=True), nargs=-1)
285			@click.option('--limit', default=10, help='Maximum number of subjects')
286			@click.option('--threshold', default=0.0, help='Minimum score threshold')
287			@click.option(
288			'--results-file',
289			type=click.File(
290			'w',
291			encoding='utf-8',
292			errors='ignore',
293			lazy=True),
294			help="""Specify file in order to write non-aggregated results per subject.
295			File directory must exists, existing file will be overwritten.""")
296			@backend_param_option
297			@common_options
298			def run_eval(project_id, paths, limit, threshold, results_file, backend_param):
299			"""
300			Analyze documents and evaluate the result.
301
302			Compare the results of automated indexing against a gold standard. The
303			path may be either a TSV file with short documents or a directory with
304			documents in separate files.
305			"""
306
307			project = get_project(project_id)
308			backend_params = parse_backend_params(backend_param, project)
309
310			hit_filter = SuggestionFilter(limit=limit, threshold=threshold)
311			eval_batch = annif.eval.EvaluationBatch(project.subjects)
312
313			if results_file:
314			try:
315			print('', end='', file=results_file)
316			click.echo('Writing per subject evaluation results to {!s}'.format(
317			results_file.name))
318			except Exception as e:
319			raise NotSupportedException(
320			"cannot open results-file for writing: " + str(e))
321			docs = open_documents(paths)
322			for doc in docs.documents:
323			results = project.suggest(doc.text, backend_params)
324			hits = hit_filter(results)
325			eval_batch.evaluate(hits,
326			annif.corpus.SubjectSet((doc.uris, doc.labels)))
327
328			template = "{0:<30}\t{1}"
329			for metric, score in eval_batch.results(results_file=results_file).items():
330			click.echo(template.format(metric + ":", score))
331
332
333			@cli.command('optimize')
334			@click.argument('project_id')
335			@click.argument('paths', type=click.Path(exists=True), nargs=-1)
336			@backend_param_option
337			@common_options
338			def run_optimize(project_id, paths, backend_param):
339			"""
340			Analyze documents, testing multiple limits and thresholds.
341
342			Evaluate the analysis results for a directory with documents against a
343			gold standard given in subject files. Test different limit/threshold
344			values and report the precision, recall and F-measure of each combination
345			of settings.
346			"""
347			project = get_project(project_id)
348			backend_params = parse_backend_params(backend_param, project)
349
350			filter_batches = generate_filter_batches(project.subjects)
351
352			ndocs = 0
353			docs = open_documents(paths)
354			for doc in docs.documents:
355			hits = project.suggest(doc.text, backend_params)
356			gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
357			for hit_filter, batch in filter_batches.values():
358			batch.evaluate(hit_filter(hits), gold_subjects)
359			ndocs += 1
360
361			click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))
362
363			best_scores = collections.defaultdict(float)
364			best_params = {}
365
366			template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
367			# Store the batches in a list that gets consumed along the way
368			# This way GC will have a chance to reclaim the memory
369			filter_batches = list(filter_batches.items())
370			while filter_batches:
371			params, filter_batch = filter_batches.pop(0)
372			results = filter_batch[1].results(metrics='simple')
373			for metric, score in results.items():
374			if score >= best_scores[metric]:
375			best_scores[metric] = score
376			best_params[metric] = params
377			click.echo(
378			template.format(
379			params[0],
380			params[1],
381			results['Precision (doc avg)'],
382			results['Recall (doc avg)'],
383			results['F1 score (doc avg)']))
384
385			click.echo()
386			template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
387			for metric in ('Precision (doc avg)',
388			'Recall (doc avg)',
389			'F1 score (doc avg)',
390			'NDCG@5',
391			'NDCG@10'):
392			click.echo(
393			template2.format(
394			metric,
395			best_scores[metric],
396			best_params[metric][0],
397			best_params[metric][1]))
398			click.echo("Documents evaluated:\t{}".format(ndocs))
399
400
401			@cli.command('hyperopt')
402			@click.argument('project_id')
403			@click.argument('paths', type=click.Path(exists=True), nargs=-1)
404			@click.option('--trials', default=10, help='Number of trials')
405			@common_options
406			def run_hyperopt(project_id, paths, trials):
407			"""
408			Optimize the hyperparameters of a project using a validation corpus.
409			"""
410			proj = get_project(project_id)
411			documents = open_documents(paths)
412			best, score = proj.hyperopt(documents, trials)
413			click.echo(f"Best NDCG score {score} with the following hyperparameters:")
414			for param, value in best.items():
415			click.echo(f"{param}:\t{value}")
416
417
418			if __name__ == '__main__':
419			cli()
420

NatLibFi / Annif

Pull Request — master (#414)

annif.cli.run_hyperopt() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like