Passed
Push — lazy-imports ( f5a695...70018e )
by Juho
16:14 queued 08:48
created

annif.cli   B

Complexity

Total Complexity 46

Size/Duplication

Total Lines 582
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 46
eloc 381
dl 0
loc 582
rs 8.72
c 0
b 0
f 0

13 Functions

Rating   Name   Duplication   Size   Complexity  
A run_hyperopt() 0 36 2
A run_train() 0 42 3
C run_optimize() 0 96 8
A run_load_vocab() 0 36 4
B run_suggest() 0 42 6
A run_learn() 0 20 1
A completion() 0 16 2
A run_list_vocabs() 0 22 3
A run_list_projects() 0 22 2
B run_index() 0 45 6
A run_clear_project() 0 9 1
C run_eval() 0 104 7
A run_show_project() 0 17 1

How to fix   Complexity   

Complexity

Complex classes like annif.cli often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import importlib
7
import json
8
import os.path
9
import re
10
import sys
11
12
import click
13
import click_log
14
from flask.cli import FlaskGroup
15
16
import annif
17
import annif.corpus
18
import annif.parallel
19
import annif.project
20
import annif.registry
21
from annif import cli_util
22
from annif.exception import NotInitializedException, NotSupportedException
23
from annif.project import Access
24
from annif.util import metric_code
25
26
# Shared logger for all CLI commands; click_log wires it up so that the
# verbosity options on individual commands control its output level.
logger = annif.logger
click_log.basic_config(logger)


# Choose the Flask app factory based on the invoked subcommand: only the
# "run" server command needs the Connexion-based app, so all other CLI
# commands avoid the cost of importing/initializing Connexion.
if len(sys.argv) > 1 and sys.argv[1] == "run":
    create_app = annif.create_app  # Use Flask with Connexion
else:
    # Connexion is not needed for most CLI commands, use plain Flask
    create_app = annif.create_flask_app

# FlaskGroup turns the app factory into the root click command group; the
# version option is attached manually so its message can be customized.
cli = FlaskGroup(create_app=create_app, add_version_option=False)
cli = click.version_option(message="%(version)s")(cli)
38
39
40
@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    # Fixed-width columns: project ID, name, language and trained status
    template = "{0: <25}{1: <45}{2: <10}{3: <7}"
    header = template.format("Project ID", "Project Name", "Language", "Trained")
    click.echo(header)
    click.echo("-" * len(header))
    projects = annif.registry.get_projects(min_access=Access.private)
    for project in projects.values():
        row = template.format(
            project.project_id,
            project.name,
            project.language,
            str(project.is_trained),
        )
        click.echo(row)
64
65
66
@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    project = cli_util.get_project(project_id)
    # Labels carry their own padding so every value starts in the same column
    rows = (
        ("Project ID:        ", project.project_id),
        ("Project Name:      ", project.name),
        ("Language:          ", project.language),
        ("Vocabulary:        ", project.vocab.vocab_id),
        ("Vocab language:    ", project.vocab_lang),
        ("Access:            ", project.access.name),
        ("Trained:           ", project.is_trained),
        ("Modification time: ", project.modification_time),
    )
    for label, value in rows:
        click.echo(f"{label}{value}")
83
84
85
@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Dropping the model data resets the project to its untrained state
    cli_util.get_project(project_id).remove_model_data()
94
95
96
@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    template = "{0: <20}{1: <20}{2: >10}  {3: <6}"
    header = template.format("Vocabulary ID", "Languages", "Size", "Loaded")
    click.echo(header)
    click.echo("-" * len(header))
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        try:
            # Reading languages/size raises if the vocabulary isn't loaded yet
            row = (",".join(sorted(vocab.languages)), len(vocab), True)
        except NotInitializedException:
            row = ("-", "-", False)
        languages, size, loaded = row
        click.echo(template.format(vocab.vocab_id, languages, size, str(loaded)))
118
119
120
@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    # Detect the file format: SKOS/RDF first, then CSV, falling back to TSV
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # SKOS/RDF file supported by rdflib
        corpus = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        # CSV file
        corpus = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    else:
        # probably a TSV file - we need to know its language
        if not language:
            click.echo(
                "Please use --language option to set the language of a TSV vocabulary.",
                err=True,
            )
            sys.exit(1)
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        corpus = annif.corpus.SubjectFileTSV(subjectfile, language)
    vocab.load_vocabulary(corpus, force=force)
156
157
158
@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    if not cached:
        # Normal training reads the documents from the given corpus paths
        documents = cli_util.open_documents(
            paths, project.subjects, project.vocab_lang, docs_limit
        )
    else:
        # --cached reuses preprocessed data, so corpus paths are disallowed
        if paths:
            raise click.UsageError(
                "Corpus paths cannot be given when using --cached option."
            )
        documents = "cached"
    project.train(documents, params, jobs)
200
201
202
@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    project = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, project)
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    project.learn(corpus, params)
222
223
224
@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # No paths at all, or a single "-", means reading from standard input
    from_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if from_stdin:
        text = sys.stdin.read()
        filtered = project.suggest([text], backend_params).filter(limit, threshold)
        cli_util.show_hits(filtered[0], project, lang)
    else:
        docs = cli_util.open_text_documents(paths, docs_limit)
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
        for suggestions, path in zip(results, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(suggestions, project, lang)
266
267
268
@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Read documents without gold-standard subjects; we only want suggestions
    documents = annif.corpus.DocumentDirectory(
        directory, None, None, require_subjects=False
    )
    results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(documents, results):
        # Result file name: replace the ".txt" extension with the suffix
        subjectfilename = re.sub(r"\.txt$", suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            # f-string for consistency with the rest of this module
            click.echo(f"Not overwriting {subjectfilename} (use --force to override)")
            continue
        with open(subjectfilename, "w", encoding="utf-8") as subjfile:
            cli_util.show_hits(suggestions, project, lang, file=subjfile)
313
314
315
@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Imported lazily: only the eval-related commands need this module
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # Probe the lazily-opened file for writability before doing the
            # expensive evaluation work, so failures surface early.
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            # Chain the original error so the root cause stays visible
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            ) from e
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    # Evaluate document batches in parallel; order does not matter because
    # the evaluation batch only accumulates statistics.
    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # Distinct loop variable: "metric" is already bound to the --metric option
    for metric_name, score in metrics.items():
        click.echo(template.format(metric_name + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
420
421
422
FILTER_BATCH_MAX_LIMIT = 15
423
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
424
425
426
@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Imported lazily: only the eval-related commands need this module.
    # (Previously this import was duplicated further down; once is enough.)
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    # Collect unfiltered suggestions once; each limit/threshold combination
    # is later evaluated by filtering these cached results.
    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        # Track the best score seen so far for each metric (>= keeps the
        # parameters of the last combination reaching the best score)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo(f"Documents evaluated:\t{ndocs}")
522
523
524
@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    project = cli_util.get_project(project_id)
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    # The backend runs the trials and returns the best-scoring configuration
    recommendation = project.hyperopt(corpus, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")
    click.echo("---")
    for line in recommendation.lines:
        click.echo(line)
    click.echo("---")
560
561
562
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)
    """
    # "import importlib" at the top of the file does NOT guarantee that the
    # importlib.metadata submodule is importable as an attribute; import it
    # explicitly here to avoid a potential AttributeError.
    import importlib.metadata

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # Click's completion machinery generates the script when the entry point
    # is re-invoked with the _ANNIF_COMPLETE environment variable set
    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)
578
579
580
# Allow running this module directly (python -m / script execution); the
# normal entry point is the installed "annif" console script.
if __name__ == "__main__":
    cli()
582