annif.cli.run_optimize() - Code Metrics - Inspection of "Refactor: SuggestionBatch representing suggestion..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — main (#681)

by Osma

created 2023-04-06 13:32 UTC

annif.cli.run_optimize() C

↳ Parent: annif.cli

Complexity

Conditions

Size

Total Lines	96
Code Lines	66

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	8
eloc	66
nop	5
dl	0
loc	96
rs	6.246
c	0
b	0
f	0

How to fix Long Method

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import json
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.corpus
import annif.parallel
import annif.project
import annif.registry
from annif import cli_util
from annif.exception import NotInitializedException, NotSupportedException
from annif.project import Access
from annif.util import metric_code

# Reuse the package-level logger and let click_log attach its console handler.
logger = annif.logger
click_log.basic_config(logger)

# Root command group: every command below is registered on it via @cli.command.
# It is a Flask CLI group created with the Annif application factory.
cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
# Attach click's --version option, whose output is just the version string.
cli = click.version_option(message="%(version)s")(cli)


@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """
    # One fixed-width, left-aligned row layout shared by the header and the data.
    row_format = "{0: <25}{1: <45}{2: <10}{3: <7}"
    heading = row_format.format("Project ID", "Project Name", "Language", "Trained")
    click.echo(heading)
    # Underline the heading with a rule of the same width.
    click.echo(len(heading) * "-")

    projects = annif.registry.get_projects(min_access=Access.private)
    for project in projects.values():
        columns = (
            project.project_id,
            project.name,
            project.language,
            str(project.is_trained),
        )
        click.echo(row_format.format(*columns))


@cli.command("show-project")
@click.argument("project_id")
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """
    project = cli_util.get_project(project_id)

    def _report_lines():
        # Generated lazily so each attribute is read immediately before its
        # line is printed.
        yield f"Project ID:        {project.project_id}"
        yield f"Project Name:      {project.name}"
        yield f"Language:          {project.language}"
        yield f"Vocabulary:        {project.vocab.vocab_id}"
        yield f"Vocab language:    {project.vocab_lang}"
        yield f"Access:            {project.access.name}"
        yield f"Trained:           {project.is_trained}"
        yield f"Modification time: {project.modification_time}"

    for line in _report_lines():
        click.echo(line)


@cli.command("clear")
@click.argument("project_id")
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Look up the project, then drop whatever model data it currently holds.
    cli_util.get_project(project_id).remove_model_data()


@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """
    # Fixed-width row layout; the size column is right-aligned.
    row_format = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    heading = row_format.format("Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(heading)
    click.echo(len(heading) * "-")

    vocabs = annif.registry.get_vocabs(min_access=Access.private)
    for vocab in vocabs.values():
        try:
            details = (",".join(sorted(vocab.languages)), len(vocab), True)
        except NotInitializedException:
            # The vocabulary is not initialized: show placeholders instead.
            details = ("-", "-", False)
        languages, size, loaded = details
        click.echo(row_format.format(vocab.vocab_id, languages, size, str(loaded)))


@cli.command("load-vocab")
@click.argument("vocab_id")
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    corpus = annif.corpus

    # The file format is detected in order: SKOS/RDF, then CSV, otherwise TSV.
    if corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        subjects = corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif corpus.SubjectFileCSV.is_csv_file(subjectfile):
        subjects = corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    elif not language:
        # Probably a TSV file, whose language must be given on the command line.
        click.echo(
            "Please use --language option to set the language of a TSV vocabulary.",
            err=True,
        )
        sys.exit(1)
    else:
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = corpus.SubjectFileTSV(subjectfile, language)

    vocab.load_vocabulary(subjects, force=force)


@cli.command("train")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)

    if not cached:
        training_data = cli_util.open_documents(
            paths, project.subjects, project.vocab_lang, docs_limit
        )
    elif paths:
        # --cached and explicit corpus paths are mutually exclusive.
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option."
        )
    else:
        # The literal string "cached" is passed to train() in place of a corpus.
        training_data = "cached"

    project.train(training_data, params, jobs)


@cli.command("learn")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    # Open the corpus with the project's subjects and vocabulary language.
    corpus = cli_util.open_documents(
        paths,
        project.subjects,
        project.vocab_lang,
        docs_limit,
    )
    project.learn(corpus, params)


@cli.command("suggest")
@click.argument("project_id")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)
    # Fall back to the project's vocabulary language when none was requested.
    lang = language if language else project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # No paths at all, or the single path "-", both mean "read from stdin".
    read_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if read_stdin:
        text = sys.stdin.read()
        batch = project.suggest([text], backend_params)
        suggestions = batch.filter(limit, threshold)[0]
        cli_util.show_hits(suggestions, project, lang)
        return

    docs = cli_util.open_text_documents(paths, docs_limit)
    results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
    for suggestions, path in zip(results, paths):
        click.echo(f"Suggestions for {path}")
        cli_util.show_hits(suggestions, project, lang)


@cli.command("index")
@click.argument("project_id")
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    # Parameters (all supplied by the Click decorators above):
    #   project_id    - identifier passed to cli_util.get_project
    #   directory     - existing directory holding the documents
    #   suffix        - replaces a trailing ".txt" to form each result file name
    #   force         - overwrite result files that already exist
    #   limit         - passed with threshold to filter() on the suggestions
    #   threshold     - see limit
    #   language      - label language; defaults to the project's vocab language
    #   backend_param - raw backend parameter overrides, parsed below
    # Raises click.BadParameter when the vocabulary lacks the label language.
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    documents = annif.corpus.DocumentDirectory(
        directory, None, None, require_subjects=False
    )
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(documents, results):
        # A callable replacement inserts the suffix literally. Passing the
        # user-supplied suffix as the replacement string would make re.sub
        # interpret backslashes and group references in it.
        subjectfilename = re.sub(r"\.txt$", lambda _match: suffix, docfilename)
        # NOTE(review): a name not ending in ".txt" is left unchanged, so the
        # result file would be the document itself - confirm DocumentDirectory
        # only yields ".txt" files.
        if os.path.exists(subjectfilename) and not force:
            click.echo(f"Not overwriting {subjectfilename} (use --force to override)")
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)


@cli.command("eval")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """
    # Raises NotSupportedException when the results file cannot be written.

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Function-scope import: this also makes ``annif`` a local name here, so
    # no ``annif.*`` reference may be placed above this line.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # The file option is declared lazy; writing an empty string to it now
        # surfaces a write failure before the evaluation work starts.
        try:
            print("", end="", file=results_file)
        except Exception as e:
            raise NotSupportedException(
                f"cannot open results-file for writing: {e}"
            ) from e
        else:
            click.echo(
                f"Writing per subject evaluation results to {results_file.name!s}"
            )
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        # Each item pairs a batch's suggestions with its gold-standard subject
        # sets, so the pairing holds whatever order imap_unordered yields them.
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # A distinct loop name avoids shadowing the ``metric`` parameter.
    for metric_name, score in metrics.items():
        click.echo(template.format(metric_name + ":", score))
    if metrics_file:
        # The JSON output is keyed by metric_code() of each metric name.
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )


# Used by the "optimize" command: passed to cli_util.generate_filter_params and
# also used as the limit when the suggestions are first computed.
FILTER_BATCH_MAX_LIMIT = 15
# Metrics the "optimize" command computes for each limit/threshold pair and
# reports a best value for, in this order.
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]


def _collect_suggestion_batches(project, project_id, backend_params, corpus, jobs):
    """Suggest subjects for every batch of ``corpus`` using a worker pool.

    Suggestions are computed with limit ``FILTER_BATCH_MAX_LIMIT`` and
    threshold 0.0, so they can be filtered repeatedly afterwards.

    Returns a ``(ndocs, suggestion_batches, subject_set_batches)`` tuple. The
    two lists are index-aligned: entry ``i`` of each comes from the same batch.
    """
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        # Suggestions and gold subjects arrive together and are appended
        # together, so the lists stay aligned whatever order batches finish in.
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            project_batch = suggestion_batch[project_id]
            ndocs += len(project_batch)
            suggestion_batches.append(project_batch)
            subject_set_batches.append(subject_sets)
    return ndocs, suggestion_batches, subject_set_batches


def _echo_best_params(best_scores, best_params, ndocs):
    """Print the best score and its (limit, threshold) pair for each metric."""
    click.echo()
    template = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        best_limit, best_threshold = best_params[metric]
        click.echo(
            template.format(metric, best_scores[metric], best_limit, best_threshold)
        )
    click.echo(f"Documents evaluated:\t{ndocs}")


@cli.command("optimize")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Function-scope import, done once: this also makes ``annif`` a local name
    # here, so no ``annif.*`` reference may be placed above this line.
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    ndocs, suggestion_batches, subject_set_batches = _collect_suggestion_batches(
        project, project_id, backend_params, corpus, jobs
    )

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    # Missing metrics default to 0.0, so any score >= 0 becomes the first best.
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"

    for limit, threshold in filter_params:
        # The suggestions are computed once and only re-filtered here.
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            # ">=" means that on a tie the later combination wins.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    _echo_best_params(best_scores, best_params, ndocs)


@cli.command("hyperopt")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    project = cli_util.get_project(project_id)
    validation_docs = cli_util.open_documents(
        paths,
        project.subjects,
        project.vocab_lang,
        docs_limit,
    )

    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    rec = project.hyperopt(validation_docs, trials, jobs, metric, results_file)

    # Report the best score, then its lines framed by "---" separators.
    separator = "---"
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo(separator)
    for report_line in rec.lines:
        click.echo(report_line)
    click.echo(separator)


# Invoke the root command group when this module is run as a script.
if __name__ == "__main__":
    cli()


1			"""Definitions for command-line (Click) commands for invoking Annif
2			operations and printing the results to console."""
3
4
5			import collections
6			import json
7			import os.path
8			import re
9			import sys
10
11			import click
12			import click_log
13			from flask.cli import FlaskGroup
14
15			import annif
16			import annif.corpus
17			import annif.parallel
18			import annif.project
19			import annif.registry
20			from annif import cli_util
21			from annif.exception import NotInitializedException, NotSupportedException
22			from annif.project import Access
23			from annif.util import metric_code
24
25			logger = annif.logger
26			click_log.basic_config(logger)
27
28			cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
29			cli = click.version_option(message="%(version)s")(cli)
30
31
32			@cli.command("list-projects")
33			@cli_util.common_options
34			@click_log.simple_verbosity_option(logger, default="ERROR")
35			def run_list_projects():
36			"""
37			List available projects.
38			\f
39			Show a list of currently defined projects. Projects are defined in a
40			configuration file, normally called ``projects.cfg``. See `Project
41			configuration
42			<https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
43			for details.
44			"""
45
46			template = "{0: <25}{1: <45}{2: <10}{3: <7}"
47			header = template.format("Project ID", "Project Name", "Language", "Trained")
48			click.echo(header)
49			click.echo("-" * len(header))
50			for proj in annif.registry.get_projects(min_access=Access.private).values():
51			click.echo(
52			template.format(
53			proj.project_id, proj.name, proj.language, str(proj.is_trained)
54			)
55			)
56
57
58			@cli.command("show-project")
59			@click.argument("project_id")
60			@cli_util.common_options
61			def run_show_project(project_id):
62			"""
63			Show information about a project.
64			"""
65
66			proj = cli_util.get_project(project_id)
67			click.echo(f"Project ID: {proj.project_id}")
68			click.echo(f"Project Name: {proj.name}")
69			click.echo(f"Language: {proj.language}")
70			click.echo(f"Vocabulary: {proj.vocab.vocab_id}")
71			click.echo(f"Vocab language: {proj.vocab_lang}")
72			click.echo(f"Access: {proj.access.name}")
73			click.echo(f"Trained: {proj.is_trained}")
74			click.echo(f"Modification time: {proj.modification_time}")
75
76
77			@cli.command("clear")
78			@click.argument("project_id")
79			@cli_util.common_options
80			def run_clear_project(project_id):
81			"""
82			Initialize the project to its original, untrained state.
83			"""
84			proj = cli_util.get_project(project_id)
85			proj.remove_model_data()
86
87
88			@cli.command("list-vocabs")
89			@cli_util.common_options
90			@click_log.simple_verbosity_option(logger, default="ERROR")
91			def run_list_vocabs():
92			"""
93			List available vocabularies.
94			"""
95
96			template = "{0: <20}{1: <20}{2: >10} {3: <6}"
97			header = template.format("Vocabulary ID", "Languages", "Size", "Loaded")
98			click.echo(header)
99			click.echo("-" * len(header))
100			for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
101			try:
102			languages = ",".join(sorted(vocab.languages))
103			size = len(vocab)
104			loaded = True
105			except NotInitializedException:
106			languages = "-"
107			size = "-"
108			loaded = False
109			click.echo(template.format(vocab.vocab_id, languages, size, str(loaded)))
110
111
112			@cli.command("load-vocab")
113			@click.argument("vocab_id")
114			@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
115			@click.option("--language", "-L", help="Language of subject file")
116			@click.option(
117			"--force",
118			"-f",
119			default=False,
120			is_flag=True,
121			help="Replace existing vocabulary completely instead of updating it",
122			)
123			@cli_util.common_options
124			def run_load_vocab(vocab_id, language, force, subjectfile):
125			"""
126			Load a vocabulary from a subject file.
127			"""
128			vocab = cli_util.get_vocab(vocab_id)
129			if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
130			# SKOS/RDF file supported by rdflib
131			subjects = annif.corpus.SubjectFileSKOS(subjectfile)
132			click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
133			elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
134			# CSV file
135			subjects = annif.corpus.SubjectFileCSV(subjectfile)
136			click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
137			else:
138			# probably a TSV file - we need to know its language
139			if not language:
140			click.echo(
141			"Please use --language option to set the language of a TSV vocabulary.",
142			err=True,
143			)
144			sys.exit(1)
145			click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
146			subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
147			vocab.load_vocabulary(subjects, force=force)
148
149
150			@cli.command("train")
151			@click.argument("project_id")
152			@click.argument("paths", type=click.Path(exists=True), nargs=-1)
153			@click.option(
154			"--cached/--no-cached",
155			"-c/-C",
156			default=False,
157			help="Reuse preprocessed training data from previous run",
158			)
159			@click.option(
160			"--jobs",
161			"-j",
162			default=0,
163			help="Number of parallel jobs (0 means choose automatically)",
164			)
165			@cli_util.docs_limit_option
166			@cli_util.backend_param_option
167			@cli_util.common_options
168			def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
169			"""
170			Train a project on a collection of documents.
171			\f
172			This will train the project using the documents from ``PATHS`` (directories
173			or possibly gzipped TSV files) in a single batch operation. If ``--cached``
174			is set, preprocessed training data from the previous run is reused instead
175			of documents input; see `Reusing preprocessed training data
176			<https://github.com/NatLibFi/Annif/wiki/
177			Reusing-preprocessed-training-data>`_.
178			"""
179			proj = cli_util.get_project(project_id)
180			backend_params = cli_util.parse_backend_params(backend_param, proj)
181			if cached:
182			if len(paths) > 0:
183			raise click.UsageError(
184			"Corpus paths cannot be given when using --cached option."
185			)
186			documents = "cached"
187			else:
188			documents = cli_util.open_documents(
189			paths, proj.subjects, proj.vocab_lang, docs_limit
190			)
191			proj.train(documents, backend_params, jobs)
192
193
194			@cli.command("learn")
195			@click.argument("project_id")
196			@click.argument("paths", type=click.Path(exists=True), nargs=-1)
197			@cli_util.docs_limit_option
198			@cli_util.backend_param_option
199			@cli_util.common_options
200			def run_learn(project_id, paths, docs_limit, backend_param):
201			"""
202			Further train an existing project on a collection of documents.
203			\f
204			Similar to the ``train`` command. This will continue training an already
205			trained project using the documents given by ``PATHS`` in a single batch
206			operation. Not supported by all backends.
207			"""
208			proj = cli_util.get_project(project_id)
209			backend_params = cli_util.parse_backend_params(backend_param, proj)
210			documents = cli_util.open_documents(
211			paths, proj.subjects, proj.vocab_lang, docs_limit
212			)
213			proj.learn(documents, backend_params)
214
215
216			@cli.command("suggest")
217			@click.argument("project_id")
218			@click.argument(
219			"paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
220			)
221			@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
222			@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
223			@click.option("--language", "-L", help="Language of subject labels")
224			@cli_util.docs_limit_option
225			@cli_util.backend_param_option
226			@cli_util.common_options
227			def run_suggest(
228			project_id, paths, limit, threshold, language, backend_param, docs_limit
229			):
230			"""
231			Suggest subjects for a single document from standard input or for one or more
232			document file(s) given its/their path(s).
233			\f
234			This will read a text document from standard input and suggest subjects for
235			it, or if given path(s) to file(s), suggest subjects for it/them.
236			"""
237			project = cli_util.get_project(project_id)
238			lang = language or project.vocab_lang
239			if lang not in project.vocab.languages:
240			raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
241			backend_params = cli_util.parse_backend_params(backend_param, project)
242
243			if paths and not (len(paths) == 1 and paths[0] == "-"):
244			docs = cli_util.open_text_documents(paths, docs_limit)
245			results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
246			for (
247			suggestions,
248			path,
249			) in zip(results, paths):
250			click.echo(f"Suggestions for {path}")
251			cli_util.show_hits(suggestions, project, lang)
252			else:
253			text = sys.stdin.read()
254			suggestions = project.suggest([text], backend_params).filter(limit, threshold)[
255			0
256			]
257			cli_util.show_hits(suggestions, project, lang)
258
259
260			@cli.command("index")
261			@click.argument("project_id")
262			@click.argument("directory", type=click.Path(exists=True, file_okay=False))
263			@click.option(
264			"--suffix", "-s", default=".annif", help="File name suffix for result files"
265			)
266			@click.option(
267			"--force/--no-force",
268			"-f/-F",
269			default=False,
270			help="Force overwriting of existing result files",
271			)
272			@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
273			@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
274			@click.option("--language", "-L", help="Language of subject labels")
275			@cli_util.backend_param_option
276			@cli_util.common_options
277			def run_index(
278			project_id, directory, suffix, force, limit, threshold, language, backend_param
279			):
280			"""
281			Index a directory with documents, suggesting subjects for each document.
282			Write the results in TSV files with the given suffix (``.annif`` by
283			default).
284			"""
285			project = cli_util.get_project(project_id)
286			lang = language or project.vocab_lang
287			if lang not in project.vocab.languages:
288			raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
289			backend_params = cli_util.parse_backend_params(backend_param, project)
290
291			documents = annif.corpus.DocumentDirectory(
292			directory, None, None, require_subjects=False
293			)
294			results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)
295
296			for (docfilename, _), suggestions in zip(documents, results):
297			subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
298			if os.path.exists(subjectfilename) and not force:
299			click.echo(
300			"Not overwriting {} (use --force to override)".format(subjectfilename)
301			)
302			continue
303			with open(subjectfilename, "w", encoding="utf-8") as subjfile:
304			cli_util.show_hits(suggestions, project, lang, file=subjfile)
305
306
@cli.command("eval")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """
    # Developer notes live in comments: the docstring above is the help text
    # of this Click command, so it is intentionally left user-oriented.
    #
    # Parameters (all supplied by Click from the decorators above):
    #   project_id    -- identifier of the project to evaluate
    #   paths         -- existing paths holding the evaluation documents
    #   limit         -- maximum number of subjects per document
    #   threshold     -- minimum score for a suggestion to be kept
    #   docs_limit    -- forwarded to cli_util.open_documents
    #   metric        -- tuple of metric names; empty means "all"
    #   metrics_file  -- optional file receiving the metrics as JSON
    #   results_file  -- optional file receiving per-subject results
    #   jobs          -- number of parallel jobs (0 means all CPUs)
    #   backend_param -- raw backend parameter overrides
    # Raises NotSupportedException when results_file cannot be written to.

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Function-scope import: it binds ``annif`` as a local name, so it has to
    # stay above every ``annif.*`` reference made in this function.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # The option is declared with lazy=True, so this empty write is
            # the first I/O on the file and surfaces open errors right away.
            print("", end="", file=results_file)
        except Exception as e:
            # Chain the original error so the real cause stays in tracebacks.
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            ) from e
        else:
            # Kept outside the try body so that a failure while echoing is
            # not misreported as a problem with opening the results file.
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        # Each item pairs the suggestions (keyed by project id) with the
        # gold-standard subject sets of the same document batch.
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # A distinct loop name avoids shadowing the ``metric`` parameter.
    for metric_name, score in metrics.items():
        click.echo(template.format(metric_name + ":", score))
    if metrics_file:
        # Keys are converted with metric_code() before being written as JSON.
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
412
413
# Largest "limit" value explored by the optimize command; it is also the limit
# used when collecting the unfiltered suggestions that are filtered afterwards.
FILTER_BATCH_MAX_LIMIT = 15
# Metrics requested from the evaluation and reported by the optimize command.
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]


def _optimize_collect_suggestions(pool_class, jobs, psmap, corpus, project_id):
    """Run suggest over every document batch of the corpus.

    Returns a tuple ``(suggestion_batches, subject_set_batches, ndocs)`` where
    the two lists are index-aligned and ``ndocs`` is the summed length of the
    suggestion batches.
    """
    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        # imap_unordered may yield batches in any order; suggestions and gold
        # subjects are appended together so the two lists stay paired.
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            project_batch = suggestion_batch[project_id]
            ndocs += len(project_batch)
            suggestion_batches.append(project_batch)
            subject_set_batches.append(subject_sets)
    return suggestion_batches, subject_set_batches, ndocs


def _optimize_score_filtered(eval_batch, filtered_results, subject_set_batches):
    """Feed filtered suggestion batches into eval_batch; return its metrics."""
    for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
        eval_batch.evaluate_many(batch, subject_sets)
    return eval_batch.results(metrics=OPTIMIZE_METRICS)


@cli.command("optimize")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    # Developer notes live in comments: the docstring above is the help text
    # of this Click command, so it is intentionally left user-oriented.
    #
    # Parameters (all supplied by Click from the decorators above):
    #   project_id    -- identifier of the project to optimize for
    #   paths         -- existing paths holding the evaluation documents
    #   jobs          -- number of parallel jobs (0 means all CPUs)
    #   docs_limit    -- forwarded to cli_util.open_documents
    #   backend_param -- raw backend parameter overrides

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Function-scope import, done once: it binds ``annif`` as a local name, so
    # it has to stay above every ``annif.*`` reference made in this function.
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    # Suggestions are collected once with the widest limit and a zero
    # threshold; every parameter combination is then applied as a filter.
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    suggestion_batches, subject_set_batches, ndocs = _optimize_collect_suggestions(
        pool_class, jobs, psmap, corpus, project_id
    )

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    # Best score seen so far per metric, and the (limit, threshold) behind it.
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"

    for limit, threshold in filter_params:
        results = _optimize_score_filtered(
            annif.eval.EvaluationBatch(project.subjects),
            orig_suggestion_results.filter(limit, threshold),
            subject_set_batches,
        )
        for metric, score in results.items():
            # ">=" means a later combination wins ties with an earlier one.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
514
515
@cli.command("hyperopt")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    # Developer notes live in comments: the docstring above is the help text
    # of this Click command, so it is intentionally left user-oriented.
    #
    # Parameters (all supplied by Click from the decorators above):
    #   project_id   -- identifier of the project to tune
    #   paths        -- existing paths holding the validation documents
    #   docs_limit   -- forwarded to cli_util.open_documents
    #   trials       -- number of trials to run
    #   jobs         -- number of parallel runs (0 means all CPUs)
    #   metric       -- name of the metric to optimize
    #   results_file -- optional file for the trial results (CSV)

    project = cli_util.get_project(project_id)
    validation_docs = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    best = project.hyperopt(validation_docs, trials, jobs, metric, results_file)

    # Report: headline with the best score, then the recommendation lines
    # framed by a separator on each side.
    click.echo(f"Got best {metric} score {best.score:.4f} with:")
    separator = "---"
    click.echo(separator)
    for param_line in best.lines:
        click.echo(param_line)
    click.echo(separator)
552
553
# Script entry point: when this module is executed directly (rather than
# imported), hand control to the ``cli`` command group defined above.
if __name__ == "__main__":
    cli()
556

NatLibFi / Annif

Pull Request — main (#681)

annif.cli.run_optimize() C

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like