Passed
Pull Request — main (#702)
by Juho
06:19 queued 03:07
created

annif.cli.run_app()   A

Complexity

Conditions 1

Size

Total Lines 13
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 8
nop 1
dl 0
loc 13
rs 10
c 0
b 0
f 0
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
import collections
5
import importlib
6
import json
7
import os.path
8
import re
9
import sys
10
11
import click
12
import click_log
13
from flask.cli import FlaskGroup
14
15
import annif
16
import annif.corpus
17
import annif.parallel
18
import annif.project
19
import annif.registry
20
from annif import cli_util
21
from annif.exception import NotInitializedException, NotSupportedException
22
from annif.project import Access
23
from annif.util import metric_code
24
25
# Shared CLI logger; click_log attaches verbosity handling to it so that
# the --verbosity option of each command controls this logger.
logger = annif.logger
click_log.basic_config(logger)

# Flask application factory used by the FlaskGroup-based CLI entry point.
create_app = annif.create_flask_app
# Top-level Click command group. Flask's default commands and version option
# are disabled; Annif registers its own commands and --version below.
cli = FlaskGroup(
    create_app=create_app, add_default_commands=False, add_version_option=False
)
cli = click.version_option(message="%(version)s")(cli)
33
34
35
@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    headings = (
        "Project ID",
        "Project Name",
        "Vocabulary ID",
        "Language",
        "Trained",
        "Modification time",
    )
    # One row per registered project; private projects are included.
    rows = []
    for project in annif.registry.get_projects(min_access=Access.private).values():
        rows.append(
            (
                project.project_id,
                project.name,
                project.vocab.vocab_id if project.vocab_spec else "-",
                project.language,
                str(project.is_trained),
                cli_util.format_datetime(project.modification_time),
            )
        )
    # Column widths are derived from both the headings and the row contents.
    template = cli_util.make_list_template(headings, *rows)
    header_line = template.format(*headings)
    click.echo(header_line)
    click.echo("-" * len(header_line))
    for row in rows:
        click.echo(template.format(*row))
74
75
76
@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    # Resolve the project; cli_util.get_project reports unknown IDs itself.
    project = cli_util.get_project(project_id)
    click.echo(f"Project ID:        {project.project_id}")
    click.echo(f"Project Name:      {project.name}")
    click.echo(f"Language:          {project.language}")
    click.echo(f"Vocabulary:        {project.vocab.vocab_id}")
    click.echo(f"Vocab language:    {project.vocab_lang}")
    click.echo(f"Access:            {project.access.name}")
    click.echo(f"Backend:           {project.backend.name}")
    click.echo(f"Trained:           {project.is_trained}")
    click.echo(f"Modification time: {cli_util.format_datetime(project.modification_time)}")
94
95
96
@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Removing the stored model data resets the project to untrained.
    project = cli_util.get_project(project_id)
    project.remove_model_data()
105
106
107
@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    headings = ("Vocabulary ID", "Languages", "Size", "Loaded")
    rows = []
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        # A vocabulary that has not been loaded yet cannot report its
        # languages or size; show placeholders for those columns instead.
        try:
            langs = ",".join(sorted(vocab.languages))
            size = len(vocab)
            loaded = True
        except NotInitializedException:
            langs, size, loaded = "-", "-", False
        rows.append((vocab.vocab_id, langs, str(size), str(loaded)))

    template = cli_util.make_list_template(headings, *rows)
    header_line = template.format(*headings)
    click.echo(header_line)
    click.echo("-" * len(header_line))
    for row in rows:
        click.echo(template.format(*row))
135
136
137
@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    # Choose a corpus reader based on the detected file format.
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # CSV file
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
    # With --force the existing vocabulary is replaced instead of updated.
    vocab.load_vocabulary(subjects, force=force)
173
174
175
@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    if not cached:
        documents = cli_util.open_documents(
            paths, project.subjects, project.vocab_lang, docs_limit
        )
    else:
        # --cached is mutually exclusive with explicit corpus paths
        if paths:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        documents = "cached"
    project.train(documents, backend_params, jobs)
217
218
219
@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    # Open the corpus and continue training on it (incremental learning).
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    project.learn(corpus, params)
239
240
241
@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)
    # Subject labels are shown in the requested language, defaulting to the
    # vocabulary language of the project.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # No paths, or a single "-", means: read one document from standard input.
    use_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if use_stdin:
        text = sys.stdin.read()
        suggestions = project.suggest([text], backend_params).filter(limit, threshold)[
            0
        ]
        cli_util.show_hits(suggestions, project, lang)
    else:
        docs = cli_util.open_text_documents(paths, docs_limit)
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
        for suggestions, path in zip(results, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(suggestions, project, lang)
283
284
285
@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    # Labels are written in the requested language, defaulting to the
    # vocabulary language of the project.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    documents = annif.corpus.DocumentDirectory(directory, require_subjects=False)
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(documents, results):
        # The result file name replaces the .txt extension with the suffix.
        subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
        if not force and os.path.exists(subjectfilename):
            click.echo(
                "Not overwriting {} (use --force to override)".format(subjectfilename)
            )
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)
328
329
330
@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Imported lazily: annif.eval pulls in heavy dependencies that are not
    # needed by the other CLI commands.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # The file is opened lazily by Click; force it open now so that an
        # unwritable path fails fast, before the expensive evaluation runs.
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            ) from e
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    # Suggest subjects for document batches in parallel and evaluate each
    # batch against its gold-standard subject sets.
    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1:{fmt_spec}}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    for metric, score in metrics.items():
        if isinstance(score, int):
            fmt_spec = "d"
        elif isinstance(score, float):
            fmt_spec = ".04f"
        else:
            # Bug fix: fmt_spec used to be undefined here for non-numeric
            # scores; an empty format spec falls back to str() formatting.
            fmt_spec = ""
        click.echo(template.format(metric + ":", score, fmt_spec=fmt_spec))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
439
440
441
@cli.command("run")
@click.option("--port", type=int)
@click.option("--log-level")
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_app(**kwargs):
    """
    Run Annif in server mode for development.
    \f
    The server is for development purposes only.
    """
    # Drop options the user did not set so the app's own defaults apply.
    options = {key: value for key, value in kwargs.items() if value is not None}
    cxapp = annif.create_cx_app()
    cxapp.run(**options)
454
455
456
FILTER_BATCH_MAX_LIMIT = 15
457
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
458
459
460
@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Imported lazily: annif.eval pulls in heavy dependencies that are not
    # needed by the other CLI commands.
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    # Request suggestions once, with the maximum limit and no threshold; each
    # (limit, threshold) combination is then evaluated by filtering these
    # results instead of re-running the expensive suggestion step.
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    # Track the best score (and the parameters that produced it) per metric.
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # NOTE: a redundant second "import annif.eval" that used to sit here has
    # been removed; the module is already imported above.
    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
556
557
558
@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    project = cli_util.get_project(project_id)
    documents = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    # Delegate the search to the backend; it returns the best trial found.
    recommendation = project.hyperopt(documents, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")
    click.echo("---")
    for line in recommendation.lines:
        click.echo(line)
    click.echo("---")
594
595
596
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def run_completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)
    """
    # Bug fix: importlib.metadata is a submodule that plain "import importlib"
    # does not load; import it explicitly so the attribute access below cannot
    # fail depending on what other modules happen to have imported it first.
    import importlib.metadata

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # Ask Click (via the _ANNIF_COMPLETE environment variable) to emit the
    # completion script for the requested shell. The shell name is restricted
    # to the flag values declared above, so interpolating it is safe.
    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)
612
613
614
# Allow invoking the CLI directly (e.g. "python cli.py") in addition to the
# installed "annif" entry point.
if __name__ == "__main__":
    cli()
616