Passed
Pull Request — master (#675)
by Juho
08:52
created

annif.cli.run_optimize()   B

Complexity

Conditions 7

Size

Total Lines 79
Code Lines 55

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 55
nop 4
dl 0
loc 79
rs 7.0727
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, especially when combined with a good name. Moreover, when a method is small, finding a good name for it is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import json
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.corpus
import annif.parallel
import annif.project
import annif.registry
from annif.cli_util import (
    backend_param_option,
    common_options,
    docs_limit_option,
    generate_filter_batches,
    get_project,
    get_vocab,
    open_documents,
    open_text_documents,
    parse_backend_params,
    show_hits,
)
from annif.exception import NotInitializedException, NotSupportedException
from annif.project import Access
from annif.suggestion import ListSuggestionResult, SuggestionFilter
from annif.util import metric_code

logger = annif.logger
click_log.basic_config(logger)

# The CLI is a Flask application group so commands run inside an app context.
cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
cli = click.version_option(message="%(version)s")(cli)
@cli.command("list-projects")
@common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    # Fixed-width columns keep the table aligned for typical ID/name lengths.
    template = "{0: <25}{1: <45}{2: <10}{3: <7}"
    header = template.format("Project ID", "Project Name", "Language", "Trained")
    click.echo(header)
    click.echo("-" * len(header))
    for proj in annif.registry.get_projects(min_access=Access.private).values():
        click.echo(
            template.format(
                proj.project_id, proj.name, proj.language, str(proj.is_trained)
            )
        )
@cli.command("show-project")
@click.argument("project_id")
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = get_project(project_id)
    click.echo(f"Project ID:        {proj.project_id}")
    click.echo(f"Project Name:      {proj.name}")
    click.echo(f"Language:          {proj.language}")
    click.echo(f"Vocabulary:        {proj.vocab.vocab_id}")
    click.echo(f"Vocab language:    {proj.vocab_lang}")
    click.echo(f"Access:            {proj.access.name}")
    click.echo(f"Trained:           {proj.is_trained}")
    click.echo(f"Modification time: {proj.modification_time}")
@cli.command("clear")
@click.argument("project_id")
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    proj = get_project(project_id)
    proj.remove_model_data()
@cli.command("list-vocabs")
@common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    template = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    header = template.format("Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(header)
    click.echo("-" * len(header))
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        try:
            languages = ",".join(sorted(vocab.languages))
            size = len(vocab)
            loaded = True
        except NotInitializedException:
            # Vocabulary data has not been loaded yet; show placeholders
            # instead of failing the whole listing.
            languages = "-"
            size = "-"
            loaded = False
        click.echo(template.format(vocab.vocab_id, languages, size, str(loaded)))
@cli.command("load-vocab")
@click.argument("vocab_id")
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = get_vocab(vocab_id)
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # CSV file
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
    vocab.load_vocabulary(subjects, force=force)
@cli.command("train")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@docs_limit_option
@backend_param_option
@common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    proj = get_project(project_id)
    backend_params = parse_backend_params(backend_param, proj)
    if cached:
        # --cached and explicit corpus paths are mutually exclusive.
        if len(paths) > 0:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        documents = "cached"
    else:
        documents = open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    proj.train(documents, backend_params, jobs)
@cli.command("learn")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@docs_limit_option
@backend_param_option
@common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    proj = get_project(project_id)
    backend_params = parse_backend_params(backend_param, proj)
    documents = open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    proj.learn(documents, backend_params)
@cli.command("suggest")
@click.argument("project_id")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@docs_limit_option
@backend_param_option
@common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    # A single "-" path means "read from standard input", like no paths at all.
    if paths and not (len(paths) == 1 and paths[0] == "-"):
        docs = open_text_documents(paths, docs_limit)
        subject_sets = project.suggest_corpus(docs, backend_params)
        for subjects, path in zip(subject_sets, paths):
            click.echo(f"Suggestions for {path}")
            hits = hit_filter(subjects)
            show_hits(hits, project, lang)
    else:
        text = sys.stdin.read()
        hits = hit_filter(project.suggest([text], backend_params)[0])
        show_hits(hits, project, lang)
@cli.command("index")
@click.argument("project_id")
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@backend_param_option
@common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    documents = annif.corpus.DocumentDirectory(
        directory, None, None, require_subjects=False
    )
    subject_sets = project.suggest_corpus(documents, backend_params)

    for (docfilename, _), subjects in zip(documents, subject_sets):
        # Result file name: replace the .txt extension with the given suffix.
        subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
            continue
        hits = hit_filter(subjects)
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            show_hits(hits, project, lang, file=subjfile)
@cli.command("eval")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@docs_limit_option
@backend_param_option
@common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    # Imported lazily: only this command needs the evaluation machinery.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # Probe the (lazy) file object now so a bad path fails early with a
        # clear error instead of after the whole evaluation has run.
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            )
    corpus = open_documents(paths, project.subjects, project.vocab_lang, docs_limit)
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # Distinct loop variable so the --metric parameter is not rebound here.
    for metric_name, score in metrics.items():
        click.echo(template.format(metric_name + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
# Highest "limit" value tried by the optimize command; also caps how many
# suggestions per document are kept before filtering.
FILTER_BATCH_MAX_LIMIT = 15
@cli.command("optimize")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@docs_limit_option
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects, FILTER_BATCH_MAX_LIMIT)

    ndocs = 0
    corpus = open_documents(paths, project.subjects, project.vocab_lang, docs_limit)
    for docs_batch in corpus.doc_batches:
        texts, subject_sets = zip(*[(doc.text, doc.subject_set) for doc in docs_batch])
        raw_hit_sets = project.suggest(texts, backend_params)
        hit_sets = [
            raw_hits.filter(project.subjects, limit=FILTER_BATCH_MAX_LIMIT)
            for raw_hits in raw_hit_sets
        ]
        assert isinstance(hit_sets[0], ListSuggestionResult), (
            "Optimize should only be done with ListSuggestionResult "
            + "as it would be very slow with VectorSuggestionResult."
        )
        for hit_filter, filter_batch in filter_batches.values():
            filtered_hits = [hit_filter(hits) for hits in hit_sets]
            filter_batch.evaluate_many(filtered_hits, subject_sets)
        ndocs += len(texts)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    # Defined before the loop so the summary below works even if there are
    # no filter batches to consume (previously a potential NameError).
    metrics = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a list that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = list(filter_batches.items())
    while filter_batches:
        params, filter_batch = filter_batches.pop(0)
        results = filter_batch[1].results(metrics=metrics)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
@cli.command("hyperopt")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@docs_limit_option
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    proj = get_project(project_id)
    documents = open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo("---")
    for line in rec.lines:
        click.echo(line)
    click.echo("---")
if __name__ == "__main__":
    cli()