Passed
Push — issue678-refactor-suggestionre... (1844ad...818ba2)
created by Osma, 02:55

annif.cli — rating B

Complexity
    Total Complexity    44

Size/Duplication
    Total Lines         555
    Duplicated Lines    0 %

Importance
    Changes             0

Metric    Value
eloc      367
dl        0
loc       555
rs        8.8798
c         0
b         0
f         0
wmc       44

12 Functions

Rating  Name                 Duplication  Size  Complexity
A       run_train()          0            42    3
A       run_load_vocab()     0            36    4
A       run_learn()          0            20    1
A       run_list_vocabs()    0            22    3
A       run_list_projects()  0            22    2
A       run_clear_project()  0            9     1
A       run_show_project()   0            17    1
B       run_suggest()        0            42    6
B       run_index()          0            45    6
C       run_eval()           0            104   7
A       run_hyperopt()       0            36    2
C       run_optimize()       0            94    8

(The per-function complexities sum to 3 + 4 + 1 + 3 + 2 + 1 + 1 + 6 + 6 + 7 + 2 + 8 = 44, matching the module's Total Complexity and its wmc metric.)

How to fix: Complexity

Complex modules like annif.cli often do a lot of different things. To break such a module down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for functions that share the same prefixes or suffixes.

Once you have determined the functions that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
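In this module, for example, run_eval and run_optimize both spell out the same open-corpus / worker-pool / suggest_batch plumbing inline. A minimal sketch of extracting that into one cohesive component, assuming the class name ParallelSuggestRunner (hypothetical, not part of Annif):

# Hypothetical sketch only: ParallelSuggestRunner is not part of Annif.
# It gathers the corpus/pool plumbing that run_eval and run_optimize
# below both duplicate inline.
import annif.parallel
from annif import cli_util


class ParallelSuggestRunner:
    """Run batched parallel suggestions for one project over a corpus."""

    def __init__(self, project, backend_params, limit, threshold):
        self.project = project
        self.psmap = annif.parallel.ProjectSuggestMap(
            project.registry, [project.project_id], backend_params, limit, threshold
        )

    def run(self, paths, docs_limit, jobs):
        """Yield (hit_sets, subject_sets) pairs, as the commands consume now."""
        corpus = cli_util.open_documents(
            paths, self.project.subjects, self.project.vocab_lang, docs_limit
        )
        n_jobs, pool_class = annif.parallel.get_pool(jobs)
        self.project.initialize(parallel=True)
        with pool_class(n_jobs) as pool:
            yield from pool.imap_unordered(
                self.psmap.suggest_batch, corpus.doc_batches
            )

Both commands would then reduce to a loop over runner.run(...), which directly lowers the wmc figure this report is flagging.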

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import json
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.corpus
import annif.parallel
import annif.project
import annif.registry
from annif import cli_util
from annif.exception import NotInitializedException, NotSupportedException
from annif.project import Access
from annif.suggestion import SuggestionResults
from annif.util import metric_code

logger = annif.logger
click_log.basic_config(logger)

cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
cli = click.version_option(message="%(version)s")(cli)

@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    template = "{0: <25}{1: <45}{2: <10}{3: <7}"
    header = template.format("Project ID", "Project Name", "Language", "Trained")
    click.echo(header)
    click.echo("-" * len(header))
    for proj in annif.registry.get_projects(min_access=Access.private).values():
        click.echo(
            template.format(
                proj.project_id, proj.name, proj.language, str(proj.is_trained)
            )
        )


@cli.command("show-project")
@click.argument("project_id")
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = cli_util.get_project(project_id)
    click.echo(f"Project ID:        {proj.project_id}")
    click.echo(f"Project Name:      {proj.name}")
    click.echo(f"Language:          {proj.language}")
    click.echo(f"Vocabulary:        {proj.vocab.vocab_id}")
    click.echo(f"Vocab language:    {proj.vocab_lang}")
    click.echo(f"Access:            {proj.access.name}")
    click.echo(f"Trained:           {proj.is_trained}")
    click.echo(f"Modification time: {proj.modification_time}")


@cli.command("clear")
@click.argument("project_id")
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    proj = cli_util.get_project(project_id)
    proj.remove_model_data()


@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    template = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    header = template.format("Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(header)
    click.echo("-" * len(header))
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        try:
            languages = ",".join(sorted(vocab.languages))
            size = len(vocab)
            loaded = True
        except NotInitializedException:
            languages = "-"
            size = "-"
            loaded = False
        click.echo(template.format(vocab.vocab_id, languages, size, str(loaded)))


@cli.command("load-vocab")
@click.argument("vocab_id")
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # CSV file
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
    vocab.load_vocabulary(subjects, force=force)


@cli.command("train")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of the documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    proj = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, proj)
    if cached:
        if len(paths) > 0:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        documents = "cached"
    else:
        documents = cli_util.open_documents(
            paths, proj.subjects, proj.vocab_lang, docs_limit
        )
    proj.train(documents, backend_params, jobs)


@cli.command("learn")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    proj = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, proj)
    documents = cli_util.open_documents(
        paths, proj.subjects, proj.vocab_lang, docs_limit
    )
    proj.learn(documents, backend_params)


@cli.command("suggest")
@click.argument("project_id")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document read from standard input, or for
    one or more document files given their paths.
    \f
    This will read a text document from standard input and suggest subjects
    for it, or, if given paths to files, suggest subjects for each of them.
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    if paths and not (len(paths) == 1 and paths[0] == "-"):
        docs = cli_util.open_text_documents(paths, docs_limit)
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
        for suggestions, path in zip(results, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(suggestions, project, lang)
    else:
        text = sys.stdin.read()
        suggestions = project.suggest([text], backend_params).filter(
            limit, threshold
        )[0]
        cli_util.show_hits(suggestions, project, lang)


@cli.command("index")
@click.argument("project_id")
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    documents = annif.corpus.DocumentDirectory(
        directory, None, None, require_subjects=False
    )
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(documents, results):
        subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)

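run_suggest and run_index above (both rated B, complexity 6) open with the same prologue: resolve the project, choose the label language, validate it against the vocabulary, and parse backend parameters. A minimal sketch of hoisting that into a shared helper; resolve_project_language is a hypothetical name, not an existing cli_util function:

# Hypothetical helper; not part of annif.cli_util. It hoists the prologue
# duplicated by run_suggest and run_index above into one place.
import click

from annif import cli_util


def resolve_project_language(project_id, language, backend_param):
    """Return (project, label language, backend params) for a CLI command."""
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)
    return project, lang, backend_params
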
@cli.command("eval")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) are assigned subject suggestions, and statistical
    measures are then calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If the ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            )
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    for metric, score in metrics.items():
        click.echo(template.format(metric + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )

FILTER_BATCH_MAX_LIMIT = 15
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]


@cli.command("optimize")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"

    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))

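run_optimize above carries the module's highest complexity (C, 8), much of it from the sweep over (limit, threshold) combinations. A sketch of extracting one sweep step into a helper; evaluate_filtered is a hypothetical name, not part of Annif:

# Hypothetical helper extracted from run_optimize's sweep loop above.
import annif.eval


def evaluate_filtered(
    project, suggestion_results, subject_set_batches, limit, threshold, metrics
):
    """Score one (limit, threshold) combination against the gold standard."""
    eval_batch = annif.eval.EvaluationBatch(project.subjects)
    filtered = suggestion_results.filter(limit, threshold)
    for batch, subject_sets in zip(filtered.batches, subject_set_batches):
        eval_batch.evaluate_many(batch, subject_sets)
    return eval_batch.results(metrics=metrics)

The loop body of run_optimize would then reduce to the best-score bookkeeping plus one call to evaluate_filtered(project, orig_suggestion_results, subject_set_batches, limit, threshold, OPTIMIZE_METRICS).
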
@cli.command("hyperopt")
@click.argument("project_id")
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    proj = cli_util.get_project(project_id)
    documents = cli_util.open_documents(
        paths, proj.subjects, proj.vocab_lang, docs_limit
    )
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    rec = proj.hyperopt(documents, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo("---")
    for line in rec.lines:
        click.echo(line)
    click.echo("---")


if __name__ == "__main__":
    cli()