annif.cli.run_completion() - Code Metrics - Inspection of "Merge branch 'main' into upgrade-to-connexion3" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — upgrade-to-connexion3 ( e417e0...5d7ec9 )

by Juho

created 2023-11-15 14:58 UTC

annif.cli.run_completion() A

↳ Parent: annif.cli

Complexity

Conditions

Size

Total Lines	16
Code Lines	10

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	10
nop	1
dl	0
loc	16
rs	9.9
c	0
b	0
f	0

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import importlib
import json
import os.path
import re
import sys

import click
import click_log
from flask.cli import FlaskGroup

import annif
import annif.corpus
import annif.parallel
import annif.project
import annif.registry
from annif import cli_util
from annif.exception import NotInitializedException, NotSupportedException
from annif.project import Access
from annif.util import metric_code

# Reuse the package-level logger and hand it to click_log.basic_config so the
# verbosity options declared on the commands below can act on it.
logger = annif.logger
click_log.basic_config(logger)


# Choose the application factory from the first command-line argument: only
# the "run" and "routes" subcommands get the Connexion-enabled application.
if len(sys.argv) > 1 and sys.argv[1] in ("run", "routes"):
    create_app = annif.create_app  # Use Flask with Connexion
else:
    # Connexion is not needed for most CLI commands, use plain Flask
    create_app = annif.create_flask_app

# FlaskGroup's own version option is switched off and a Click version option
# is attached instead; its message template consists of the version only.
cli = FlaskGroup(create_app=create_app, add_version_option=False)
cli = click.version_option(message="%(version)s")(cli)


@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    # NOTE: the docstring above doubles as the command's help text in Click.
    headings = (
        "Project ID",
        "Project Name",
        "Vocabulary ID",
        "Language",
        "Trained",
        "Modification time",
    )

    # Collect one tuple of display values per project, including private ones.
    projects = annif.registry.get_projects(min_access=Access.private)
    rows = []
    for project in projects.values():
        rows.append(
            (
                project.project_id,
                project.name,
                # Projects without a vocabulary specification show a dash.
                project.vocab.vocab_id if project.vocab_spec else "-",
                project.language,
                str(project.is_trained),
                cli_util.format_datetime(project.modification_time),
            )
        )

    # The row template is derived from the headings together with all rows.
    row_template = cli_util.make_list_template(headings, *rows)
    heading_line = row_template.format(*headings)
    click.echo(heading_line)
    click.echo("-" * len(heading_line))  # underline as wide as the header
    for cells in rows:
        click.echo(row_template.format(*cells))


@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    def emit(label, value):
        # Labels are pre-padded to a common width so the values line up.
        click.echo(f"{label}{value}")

    proj = cli_util.get_project(project_id)
    # Each attribute is read right before its own line is printed, so lines
    # already printed stay on screen if a later attribute lookup fails.
    emit("Project ID:        ", proj.project_id)
    emit("Project Name:      ", proj.name)
    emit("Language:          ", proj.language)
    emit("Vocabulary:        ", proj.vocab.vocab_id)
    emit("Vocab language:    ", proj.vocab_lang)
    emit("Access:            ", proj.access.name)
    emit("Backend:           ", proj.backend.name)
    emit("Trained:           ", proj.is_trained)
    emit("Modification time: ", cli_util.format_datetime(proj.modification_time))


@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Look the project up by its ID and ask it to remove its model data.
    cli_util.get_project(project_id).remove_model_data()


@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    headings = ("Vocabulary ID", "Languages", "Size", "Loaded")

    vocabs = annif.registry.get_vocabs(min_access=Access.private)
    rows = []
    for vocab in vocabs.values():
        try:
            # Reading languages or size raises NotInitializedException for a
            # vocabulary that is not initialized; that case is handled below.
            details = (",".join(sorted(vocab.languages)), str(len(vocab)), "True")
        except NotInitializedException:
            # Dashes stand in for the unavailable values.
            details = ("-", "-", "False")
        rows.append((vocab.vocab_id, *details))

    row_template = cli_util.make_list_template(headings, *rows)
    heading_line = row_template.format(*headings)
    click.echo(heading_line)
    click.echo("-" * len(heading_line))  # underline as wide as the header
    for cells in rows:
        click.echo(row_template.format(*cells))


@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    corpus = annif.corpus

    # The file format is probed in a fixed order: RDF first, then CSV; any
    # other file is treated as TSV.
    if corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif corpus.SubjectFileCSV.is_csv_file(subjectfile):
        subjects = corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    elif language:
        # Probably a TSV file; its language comes from the --language option.
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = corpus.SubjectFileTSV(subjectfile, language)
    else:
        # A TSV file was assumed but no language was given: report the
        # problem on stderr and exit with status 1.
        click.echo(
            "Please use --language option to set the language of a TSV vocabulary.",
            err=True,
        )
        sys.exit(1)

    vocab.load_vocabulary(subjects, force=force)


@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)

    # --cached and explicit corpus paths are mutually exclusive.
    if cached and paths:
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option."
        )

    if cached:
        # The literal string "cached" is passed in place of a corpus.
        training_docs = "cached"
    else:
        training_docs = cli_util.open_documents(
            paths, project.subjects, project.vocab_lang, docs_limit
        )

    project.train(training_docs, params, jobs)


@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    # Open the corpus using the project's subjects and vocabulary language,
    # honouring the optional document limit.
    learning_docs = cli_util.open_documents(
        paths,
        project.subjects,
        project.vocab_lang,
        docs_limit,
    )
    project.learn(learning_docs, params)


@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)

    # Labels are shown in the requested language, falling back to the
    # project's vocabulary language; the vocabulary must provide it.
    label_lang = language or project.vocab_lang
    if label_lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{label_lang}" not supported by vocabulary')
    params = cli_util.parse_backend_params(backend_param, project)

    # No paths at all, or a lone "-", means the text comes from stdin.
    lone_dash = len(paths) == 1 and paths[0] == "-"
    if not paths or lone_dash:
        stdin_text = sys.stdin.read()
        batch = project.suggest([stdin_text], params).filter(limit, threshold)
        cli_util.show_hits(batch[0], project, label_lang)
        return

    # Otherwise suggest for each file and print the hits under a heading.
    corpus = cli_util.open_text_documents(paths, docs_limit)
    filtered = project.suggest_corpus(corpus, params).filter(limit, threshold)
    for hits, doc_path in zip(filtered, paths):
        click.echo(f"Suggestions for {doc_path}")
        cli_util.show_hits(hits, project, label_lang)


@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)

    # Labels are written in the requested language, falling back to the
    # project's vocabulary language; the vocabulary must provide it.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    documents = annif.corpus.DocumentDirectory(directory, require_subjects=False)
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(documents, results):
        # Swap a trailing ".txt" for the result suffix. A function is used as
        # the replacement so that the user-supplied suffix is inserted
        # literally; a plain replacement string would have any backslashes in
        # it interpreted as escapes or group references by re.sub.
        subjectfilename = re.sub(r"\.txt$", lambda _match: suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            # Existing result files are kept unless --force was given.
            click.echo(f"Not overwriting {subjectfilename} (use --force to override)")
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)


@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Imported here rather than at module level; it must stay ahead of every
    # other use of the name ``annif`` in this function, because a
    # function-level import makes ``annif`` a local name.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # The option declares the file as lazy, so it is not opened until
            # first written to. Writing an empty string triggers the open
            # now, surfacing an unwritable path before any suggesting is done.
            print("", end="", file=results_file)
            click.echo(
                f"Writing per subject evaluation results to {results_file.name!s}"
            )
        except Exception as e:
            # Keep the original exception attached as the cause.
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            ) from e
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    # Batches may come back in any order; each result carries its own
    # gold-standard subject sets, so they can be evaluated as they arrive.
    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1:{fmt_spec}}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # A separate loop name is used so the ``metric`` option value is not
    # overwritten by the names of the computed metrics.
    for metric_name, score in metrics.items():
        if isinstance(score, int):
            fmt_spec = "d"
        elif isinstance(score, float):
            fmt_spec = ".04f"
        else:
            # Any other score type gets default formatting instead of an
            # unbound name or the format left over from the previous metric.
            fmt_spec = ""
        click.echo(template.format(metric_name + ":", score, fmt_spec=fmt_spec))

    if metrics_file:
        # The JSON keys are the metric names passed through metric_code().
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )


# Used by the "optimize" command: passed to cli_util.generate_filter_params()
# and also used as the limit when collecting the unfiltered suggestions.
FILTER_BATCH_MAX_LIMIT = 15
# Names of the metrics the "optimize" command computes and reports.
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]


@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Imported once, here; it must stay ahead of every other use of the name
    # ``annif`` in this function, because a function-level import makes
    # ``annif`` a local name.
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    # Suggestions are collected once with the maximum limit and a zero
    # threshold; the limit/threshold combinations are applied afterwards.
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        # Batches may arrive in any order; suggestions and gold-standard
        # subject sets are appended together so the two lists stay aligned.
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    # Best score seen so far per metric (starting from 0.0) and the
    # (limit, threshold) pair that produced it.
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"

    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            # ">=" means that on a tie the later combination is kept.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo(f"Documents evaluated:\t{ndocs}")


@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    project = cli_util.get_project(project_id)
    validation_docs = cli_util.open_documents(
        paths,
        project.subjects,
        project.vocab_lang,
        docs_limit,
    )

    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    best = project.hyperopt(validation_docs, trials, jobs, metric, results_file)

    # Report the best score, then the recommendation lines between two
    # "---" separator lines.
    click.echo(f"Got best {metric} score {best.score:.4f} with:")
    separator = "---"
    click.echo(separator)
    for recommendation_line in best.lines:
        click.echo(recommendation_line)
    click.echo(separator)


@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def run_completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)
    """

    # A plain ``import importlib`` does not guarantee that the ``metadata``
    # submodule has been loaded, so import it explicitly before use.
    import importlib.metadata

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # ``shell`` can only be one of the flag values declared above, so it is
    # safe to interpolate into the command line. The pipe is closed as soon
    # as the generated script has been read.
    with os.popen(f"_ANNIF_COMPLETE={shell}_source annif") as pipe:
        script = pipe.read()
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)


# Invoke the Click command group when this module is executed as a script.
if __name__ == "__main__":
    cli()


1			"""Definitions for command-line (Click) commands for invoking Annif
2			operations and printing the results to console."""
3
4
5			import collections
6			import importlib
7			import json
8			import os.path
9			import re
10			import sys
11
12			import click
13			import click_log
14			from flask.cli import FlaskGroup
15
16			import annif
17			import annif.corpus
18			import annif.parallel
19			import annif.project
20			import annif.registry
21			from annif import cli_util
22			from annif.exception import NotInitializedException, NotSupportedException
23			from annif.project import Access
24			from annif.util import metric_code
25
26			logger = annif.logger
27			click_log.basic_config(logger)
28
29
30			if len(sys.argv) > 1 and sys.argv[1] in ("run", "routes"):
31			create_app = annif.create_app # Use Flask with Connexion
32			else:
33			# Connexion is not needed for most CLI commands, use plain Flask
34			create_app = annif.create_flask_app
35
36			cli = FlaskGroup(create_app=create_app, add_version_option=False)
37			cli = click.version_option(message="%(version)s")(cli)
38
39
40			@cli.command("list-projects")
41			@cli_util.common_options
42			@click_log.simple_verbosity_option(logger, default="ERROR")
43			def run_list_projects():
44			"""
45			List available projects.
46			\f
47			Show a list of currently defined projects. Projects are defined in a
48			configuration file, normally called ``projects.cfg``. See `Project
49			configuration
50			<https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
51			for details.
52			"""
53
54			column_headings = (
55			"Project ID",
56			"Project Name",
57			"Vocabulary ID",
58			"Language",
59			"Trained",
60			"Modification time",
61			)
62			table = [
63			(
64			proj.project_id,
65			proj.name,
66			proj.vocab.vocab_id if proj.vocab_spec else "-",
67			proj.language,
68			str(proj.is_trained),
69			cli_util.format_datetime(proj.modification_time),
70			)
71			for proj in annif.registry.get_projects(min_access=Access.private).values()
72			]
73			template = cli_util.make_list_template(column_headings, *table)
74			header = template.format(*column_headings)
75			click.echo(header)
76			click.echo("-" * len(header))
77			for row in table:
78			click.echo(template.format(*row))
79
80
81			@cli.command("show-project")
82			@cli_util.project_id
83			@cli_util.common_options
84			def run_show_project(project_id):
85			"""
86			Show information about a project.
87			"""
88
89			proj = cli_util.get_project(project_id)
90			click.echo(f"Project ID: {proj.project_id}")
91			click.echo(f"Project Name: {proj.name}")
92			click.echo(f"Language: {proj.language}")
93			click.echo(f"Vocabulary: {proj.vocab.vocab_id}")
94			click.echo(f"Vocab language: {proj.vocab_lang}")
95			click.echo(f"Access: {proj.access.name}")
96			click.echo(f"Backend: {proj.backend.name}")
97			click.echo(f"Trained: {proj.is_trained}")
98			click.echo(f"Modification time: {cli_util.format_datetime(proj.modification_time)}")
99
100
101			@cli.command("clear")
102			@cli_util.project_id
103			@cli_util.common_options
104			def run_clear_project(project_id):
105			"""
106			Initialize the project to its original, untrained state.
107			"""
108			proj = cli_util.get_project(project_id)
109			proj.remove_model_data()
110
111
112			@cli.command("list-vocabs")
113			@cli_util.common_options
114			@click_log.simple_verbosity_option(logger, default="ERROR")
115			def run_list_vocabs():
116			"""
117			List available vocabularies.
118			"""
119
120			column_headings = ("Vocabulary ID", "Languages", "Size", "Loaded")
121			table = []
122			for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
123			try:
124			languages = ",".join(sorted(vocab.languages))
125			size = len(vocab)
126			loaded = True
127			except NotInitializedException:
128			languages = "-"
129			size = "-"
130			loaded = False
131			row = (vocab.vocab_id, languages, str(size), str(loaded))
132			table.append(row)
133
134			template = cli_util.make_list_template(column_headings, *table)
135			header = template.format(*column_headings)
136			click.echo(header)
137			click.echo("-" * len(header))
138			for row in table:
139			click.echo(template.format(*row))
140
141
142			@cli.command("load-vocab")
143			@click.argument("vocab_id", shell_complete=cli_util.complete_param)
144			@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
145			@click.option("--language", "-L", help="Language of subject file")
146			@click.option(
147			"--force",
148			"-f",
149			default=False,
150			is_flag=True,
151			help="Replace existing vocabulary completely instead of updating it",
152			)
153			@cli_util.common_options
154			def run_load_vocab(vocab_id, language, force, subjectfile):
155			"""
156			Load a vocabulary from a subject file.
157			"""
158			vocab = cli_util.get_vocab(vocab_id)
159			if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
160			# SKOS/RDF file supported by rdflib
161			subjects = annif.corpus.SubjectFileSKOS(subjectfile)
162			click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
163			elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
164			# CSV file
165			subjects = annif.corpus.SubjectFileCSV(subjectfile)
166			click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
167			else:
168			# probably a TSV file - we need to know its language
169			if not language:
170			click.echo(
171			"Please use --language option to set the language of a TSV vocabulary.",
172			err=True,
173			)
174			sys.exit(1)
175			click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
176			subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
177			vocab.load_vocabulary(subjects, force=force)
178
179
180			@cli.command("train")
181			@cli_util.project_id
182			@click.argument("paths", type=click.Path(exists=True), nargs=-1)
183			@click.option(
184			"--cached/--no-cached",
185			"-c/-C",
186			default=False,
187			help="Reuse preprocessed training data from previous run",
188			)
189			@click.option(
190			"--jobs",
191			"-j",
192			default=0,
193			help="Number of parallel jobs (0 means choose automatically)",
194			)
195			@cli_util.docs_limit_option
196			@cli_util.backend_param_option
197			@cli_util.common_options
198			def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
199			"""
200			Train a project on a collection of documents.
201			\f
202			This will train the project using the documents from ``PATHS`` (directories
203			or possibly gzipped TSV files) in a single batch operation. If ``--cached``
204			is set, preprocessed training data from the previous run is reused instead
205			of documents input; see `Reusing preprocessed training data
206			<https://github.com/NatLibFi/Annif/wiki/
207			Reusing-preprocessed-training-data>`_.
208			"""
209			proj = cli_util.get_project(project_id)
210			backend_params = cli_util.parse_backend_params(backend_param, proj)
211			if cached:
212			if len(paths) > 0:
213			raise click.UsageError(
214			"Corpus paths cannot be given when using --cached option."
215			)
216			documents = "cached"
217			else:
218			documents = cli_util.open_documents(
219			paths, proj.subjects, proj.vocab_lang, docs_limit
220			)
221			proj.train(documents, backend_params, jobs)
222
223
224			@cli.command("learn")
225			@cli_util.project_id
226			@click.argument("paths", type=click.Path(exists=True), nargs=-1)
227			@cli_util.docs_limit_option
228			@cli_util.backend_param_option
229			@cli_util.common_options
230			def run_learn(project_id, paths, docs_limit, backend_param):
231			"""
232			Further train an existing project on a collection of documents.
233			\f
234			Similar to the ``train`` command. This will continue training an already
235			trained project using the documents given by ``PATHS`` in a single batch
236			operation. Not supported by all backends.
237			"""
238			proj = cli_util.get_project(project_id)
239			backend_params = cli_util.parse_backend_params(backend_param, proj)
240			documents = cli_util.open_documents(
241			paths, proj.subjects, proj.vocab_lang, docs_limit
242			)
243			proj.learn(documents, backend_params)
244
245
246			@cli.command("suggest")
247			@cli_util.project_id
248			@click.argument(
249			"paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
250			)
251			@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
252			@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
253			@click.option("--language", "-L", help="Language of subject labels")
254			@cli_util.docs_limit_option
255			@cli_util.backend_param_option
256			@cli_util.common_options
257			def run_suggest(
258			project_id, paths, limit, threshold, language, backend_param, docs_limit
259			):
260			"""
261			Suggest subjects for a single document from standard input or for one or more
262			document file(s) given its/their path(s).
263			\f
264			This will read a text document from standard input and suggest subjects for
265			it, or if given path(s) to file(s), suggest subjects for it/them.
266			"""
267			project = cli_util.get_project(project_id)
268			lang = language or project.vocab_lang
269			if lang not in project.vocab.languages:
270			raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
271			backend_params = cli_util.parse_backend_params(backend_param, project)
272
273			if paths and not (len(paths) == 1 and paths[0] == "-"):
274			docs = cli_util.open_text_documents(paths, docs_limit)
275			results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
276			for (
277			suggestions,
278			path,
279			) in zip(results, paths):
280			click.echo(f"Suggestions for {path}")
281			cli_util.show_hits(suggestions, project, lang)
282			else:
283			text = sys.stdin.read()
284			suggestions = project.suggest([text], backend_params).filter(limit, threshold)[
285			0
286			]
287			cli_util.show_hits(suggestions, project, lang)
288
289
@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    # Developer notes (kept out of the docstring, which Click shows as --help):
    #
    # project_id     -- identifier resolved through cli_util.get_project()
    # directory      -- existing directory that is scanned for documents
    # suffix         -- replaces a trailing ".txt" of each document file name
    #                   to form the name of the result file
    # force          -- overwrite result files that already exist
    # limit          -- passed to the suggestion filter together with threshold
    # threshold      -- passed to the suggestion filter together with limit
    # language       -- label language; falls back to project.vocab_lang
    # backend_param  -- raw values parsed by cli_util.parse_backend_params()
    #
    # Raises click.BadParameter when the chosen language is not among
    # project.vocab.languages.
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    documents = annif.corpus.DocumentDirectory(directory, require_subjects=False)
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    # The directory is iterated a second time here so that every document file
    # name can be paired with the suggestions computed for it.
    for (docfilename, _), suggestions in zip(documents, results):
        # A callable replacement makes re.sub insert the suffix literally.
        # Passing the suffix as a replacement string would make re.sub
        # interpret backslashes in it as escapes or group references.
        subjectfilename = re.sub(r"\.txt$", lambda _match: suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo(f"Not overwriting {subjectfilename} (use --force to override)")
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)
333
334
@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """
    # Developer notes:
    #
    # metric        -- tuple of metric names selected with -m; forwarded as-is
    #                  to EvaluationBatch.results()
    # metrics_file  -- optional lazily opened file that receives the metrics
    #                  as JSON, keyed by metric_code(<metric name>)
    # results_file  -- optional lazily opened file handed to
    #                  EvaluationBatch.results()
    #
    # Raises NotSupportedException when the results file cannot be written.

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # The file is opened lazily, so write an empty string to force it
            # open now and fail early if it is not writable.
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            ) from e
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1:{fmt_spec}}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # A separate loop variable keeps the ``metric`` parameter intact.
    for metric_name, score in metrics.items():
        if isinstance(score, int):
            fmt_spec = "d"
        elif isinstance(score, float):
            fmt_spec = ".04f"
        else:
            # Any other score type is printed with its default formatting.
            # Without this branch fmt_spec would be unbound, or left over
            # from the previous row.
            fmt_spec = ""
        click.echo(template.format(metric_name + ":", score, fmt_spec=fmt_spec))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
444
445
# Upper bound used by the "optimize" command: it is passed to
# cli_util.generate_filter_params() and is also the limit given to the
# suggestion map.
FILTER_BATCH_MAX_LIMIT = 15

# Metrics that the "optimize" command requests from the evaluation batch and
# reports in its "Best ..." summary, in output order.
OPTIMIZE_METRICS = [
    "Precision (doc avg)",
    "Recall (doc avg)",
    "F1 score (doc avg)",
]
448
449
@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    # Suggestions are computed once with the widest limit and a zero
    # threshold; each (limit, threshold) pair is applied afterwards by
    # filtering the stored results.
    suggest_map = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    doc_count = 0
    collected_suggestions = []
    collected_subject_sets = []
    with pool_class(jobs) as pool:
        batch_stream = pool.imap_unordered(
            suggest_map.suggest_batch, corpus.doc_batches
        )
        for suggestions_by_project, gold_subject_sets in batch_stream:
            doc_count += len(suggestions_by_project[project_id])
            collected_suggestions.append(suggestions_by_project[project_id])
            collected_subject_sets.append(gold_subject_sets)

    from annif.suggestion import SuggestionResults

    unfiltered_results = SuggestionResults(collected_suggestions)

    column_titles = ("Limit", "Thresh.", "Prec.", "Rec.", "F1")
    click.echo("\t".join(column_titles))

    # Best score seen so far per metric, and the parameters that produced it.
    best_scores = collections.defaultdict(float)
    best_params = {}

    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered = unfiltered_results.filter(limit, threshold)
        for batch, gold_subject_sets in zip(filtered.batches, collected_subject_sets):
            eval_batch.evaluate_many(batch, gold_subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)

        # ">=" means a later pair with an equal score replaces the earlier one.
        for metric_name, score in results.items():
            if score >= best_scores[metric_name]:
                best_scores[metric_name] = score
                best_params[metric_name] = (limit, threshold)

        precision = results["Precision (doc avg)"]
        recall = results["Recall (doc avg)"]
        f1_score = results["F1 score (doc avg)"]
        click.echo(
            f"{limit:d}\t{threshold:.02f}"
            f"\t{precision:.04f}\t{recall:.04f}\t{f1_score:.04f}"
        )

    click.echo()
    for metric_name in OPTIMIZE_METRICS:
        top_score = best_scores[metric_name]
        top_limit, top_threshold = best_params[metric_name]
        click.echo(
            f"Best {metric_name:>19}: {top_score:.04f}"
            f"\tLimit: {top_limit:d}\tThreshold: {top_threshold:.02f}"
        )
    click.echo(f"Documents evaluated:\t{doc_count}")
546
547
@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    project = cli_util.get_project(project_id)
    validation_docs = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    # The project runs the search; the returned object exposes the best
    # ``score`` and printable ``lines``.
    recommendation = project.hyperopt(
        validation_docs, trials, jobs, metric, results_file
    )

    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")
    separator = "---"
    click.echo(separator)
    for report_line in recommendation.lines:
        click.echo(report_line)
    click.echo(separator)
584
585
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def run_completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)
    """
    # shell is "bash", "zsh" or "fish" (the flag values above), or None when
    # no flag was given, in which case click.UsageError is raised.

    # Import the submodule explicitly. A plain ``import importlib`` does not
    # guarantee that ``importlib.metadata`` is loaded.
    import importlib.metadata

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # Run the "annif" executable with _ANNIF_COMPLETE set and capture what it
    # prints. The context manager closes the pipe once the output is read.
    with os.popen(f"_ANNIF_COMPLETE={shell}_source annif") as pipe:
        script = pipe.read()
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)
602
603
if __name__ == "__main__":
    # Invoke the Click command group when this module is run as a script.
    cli()
606

NatLibFi / Annif

Push — upgrade-to-connexion3 ( e417e0...5d7ec9 )

annif.cli.run_completion() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like