Passed
Push — issue686-cli-command-list-proj... ( 267ee5...0cc9fe )
by Juho
05:55 queued 03:04
created

annif.cli.completion()   A

Complexity

Conditions 2

Size

Total Lines 16
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 10
nop 1
dl 0
loc 16
rs 9.9
c 0
b 0
f 0
1
"""Definitions for command-line (Click) commands for invoking Annif
2
operations and printing the results to console."""
3
4
5
import collections
6
import importlib
7
import json
8
import os.path
9
import re
10
import sys
11
12
import click
13
import click_log
14
from flask.cli import FlaskGroup
15
16
import annif
17
import annif.corpus
18
import annif.parallel
19
import annif.project
20
import annif.registry
21
from annif import cli_util
22
from annif.exception import NotInitializedException, NotSupportedException
23
from annif.project import Access
24
from annif.util import metric_code
25
26
logger = annif.logger
27
click_log.basic_config(logger)
28
29
cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
30
cli = click.version_option(message="%(version)s")(cli)
31
32
33
@cli.command("list-projects")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_projects():
    """
    List available projects.
    \f
    Show a list of currently defined projects. Projects are defined in a
    configuration file, normally called ``projects.cfg``. See `Project
    configuration
    <https://github.com/NatLibFi/Annif/wiki/Project-configuration>`_
    for details.
    """

    columns = (
        "Project ID",
        "Project Name",
        "Vocabulary ID",
        "Language",
        "Trained",
        "Modification time",
    )
    # Collect one row of display strings per project, including private ones.
    rows = []
    for proj in annif.registry.get_projects(min_access=Access.private).values():
        vocab_id = proj.vocab.vocab_id if proj.vocab_spec else "-"
        rows.append(
            (
                proj.project_id,
                proj.name,
                vocab_id,
                proj.language,
                str(proj.is_trained),
                cli_util.format_datetime(proj.modification_time),
            )
        )
    # Build a column-width-aware template from the headings and all rows.
    template = cli_util.make_list_template(columns, *rows)
    header = template.format(*columns)
    click.echo(header)
    click.echo("-" * len(header))
    for row in rows:
        click.echo(template.format(*row))
72
73
74
@cli.command("show-project")
@cli_util.project_id
@cli_util.common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    proj = cli_util.get_project(project_id)
    # Label/value pairs printed in a fixed order; labels are left-padded to
    # 19 characters so the values line up in one column.
    fields = (
        ("Project ID:", proj.project_id),
        ("Project Name:", proj.name),
        ("Language:", proj.language),
        ("Vocabulary:", proj.vocab.vocab_id),
        ("Vocab language:", proj.vocab_lang),
        ("Access:", proj.access.name),
        ("Backend:", proj.backend.name),
        ("Trained:", proj.is_trained),
        ("Modification time:", cli_util.format_datetime(proj.modification_time)),
    )
    for label, value in fields:
        click.echo(f"{label:<19}{value}")
92
93
94
@cli.command("clear")
@cli_util.project_id
@cli_util.common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Look up the project and discard any trained model data it holds.
    cli_util.get_project(project_id).remove_model_data()
103
104
105
@cli.command("list-vocabs")
@cli_util.common_options
@click_log.simple_verbosity_option(logger, default="ERROR")
def run_list_vocabs():
    """
    List available vocabularies.
    """

    column_headings = ("Vocabulary ID", "Languages", "Size", "Loaded")
    rows = []
    for vocab in annif.registry.get_vocabs(min_access=Access.private).values():
        # A vocabulary that has not been loaded yet cannot report its
        # languages or size; show placeholders for it instead.
        try:
            langs = ",".join(sorted(vocab.languages))
            size = str(len(vocab))
            loaded = str(True)
        except NotInitializedException:
            langs, size, loaded = "-", "-", str(False)
        rows.append((vocab.vocab_id, langs, size, loaded))

    template = cli_util.make_list_template(column_headings, *rows)
    header = template.format(*column_headings)
    click.echo(header)
    click.echo("-" * len(header))
    for row in rows:
        click.echo(template.format(*row))
133
134
135
@cli.command("load-vocab")
@click.argument("vocab_id", shell_complete=cli_util.complete_param)
@click.argument("subjectfile", type=click.Path(exists=True, dir_okay=False))
@click.option("--language", "-L", help="Language of subject file")
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace existing vocabulary completely instead of updating it",
)
@cli_util.common_options
def run_load_vocab(vocab_id, language, force, subjectfile):
    """
    Load a vocabulary from a subject file.
    """
    vocab = cli_util.get_vocab(vocab_id)
    # The file format is detected by content: SKOS/RDF first, then CSV;
    # anything else is treated as TSV, which requires an explicit language.
    if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        subjects = annif.corpus.SubjectFileSKOS(subjectfile)
        click.echo(f"Loading vocabulary from SKOS file {subjectfile}...")
    elif annif.corpus.SubjectFileCSV.is_csv_file(subjectfile):
        subjects = annif.corpus.SubjectFileCSV(subjectfile)
        click.echo(f"Loading vocabulary from CSV file {subjectfile}...")
    elif language:
        click.echo(f"Loading vocabulary from TSV file {subjectfile}...")
        subjects = annif.corpus.SubjectFileTSV(subjectfile, language)
    else:
        # TSV files carry no language information of their own.
        click.echo(
            "Please use --language option to set the language of a TSV vocabulary.",
            err=True,
        )
        sys.exit(1)
    vocab.load_vocabulary(subjects, force=force)
171
172
173
@cli.command("train")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--cached/--no-cached",
    "-c/-C",
    default=False,
    help="Reuse preprocessed training data from previous run",
)
@click.option(
    "--jobs",
    "-j",
    default=0,
    help="Number of parallel jobs (0 means choose automatically)",
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    \f
    This will train the project using the documents from ``PATHS`` (directories
    or possibly gzipped TSV files) in a single batch operation. If ``--cached``
    is set, preprocessed training data from the previous run is reused instead
    of documents input; see `Reusing preprocessed training data
    <https://github.com/NatLibFi/Annif/wiki/
    Reusing-preprocessed-training-data>`_.
    """
    proj = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, proj)
    if not cached:
        documents = cli_util.open_documents(
            paths, proj.subjects, proj.vocab_lang, docs_limit
        )
    elif paths:
        # Corpus paths and --cached are mutually exclusive inputs.
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option."
        )
    else:
        # The "cached" sentinel tells the project to reuse its stored data.
        documents = "cached"
    proj.train(documents, backend_params, jobs)
215
216
217
@cli.command("learn")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    \f
    Similar to the ``train`` command. This will continue training an already
    trained project using the documents given by ``PATHS`` in a single batch
    operation. Not supported by all backends.
    """
    proj = cli_util.get_project(project_id)
    params = cli_util.parse_backend_params(backend_param, proj)
    corpus = cli_util.open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    proj.learn(corpus, params)
237
238
239
@cli.command("suggest")
@cli_util.project_id
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_suggest(
    project_id, paths, limit, threshold, language, backend_param, docs_limit
):
    """
    Suggest subjects for a single document from standard input or for one or more
    document file(s) given its/their path(s).
    \f
    This will read a text document from standard input and suggest subjects for
    it, or if given path(s) to file(s), suggest subjects for it/them.
    """
    project = cli_util.get_project(project_id)
    # Subject labels are shown in the vocabulary language unless overridden.
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # No paths, or the single path "-", means: read one document from stdin.
    from_stdin = not paths or (len(paths) == 1 and paths[0] == "-")
    if from_stdin:
        text = sys.stdin.read()
        filtered = project.suggest([text], backend_params).filter(limit, threshold)
        cli_util.show_hits(filtered[0], project, lang)
    else:
        docs = cli_util.open_text_documents(paths, docs_limit)
        results = project.suggest_corpus(docs, backend_params).filter(limit, threshold)
        for suggestions, path in zip(results, paths):
            click.echo(f"Suggestions for {path}")
            cli_util.show_hits(suggestions, project, lang)
281
282
283
@cli.command("index")
@cli_util.project_id
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--suffix", "-s", default=".annif", help="File name suffix for result files"
)
@click.option(
    "--force/--no-force",
    "-f/-F",
    default=False,
    help="Force overwriting of existing result files",
)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option("--language", "-L", help="Language of subject labels")
@cli_util.backend_param_option
@cli_util.common_options
def run_index(
    project_id, directory, suffix, force, limit, threshold, language, backend_param
):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix (``.annif`` by
    default).
    """
    project = cli_util.get_project(project_id)
    lang = language or project.vocab_lang
    if lang not in project.vocab.languages:
        raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Gold-standard subjects are not needed here, only the document texts.
    corpus = annif.corpus.DocumentDirectory(
        directory, None, None, require_subjects=False
    )
    results = project.suggest_corpus(corpus, backend_params).filter(limit, threshold)

    for (docfilename, _), suggestions in zip(corpus, results):
        # Result file sits next to the document, with .txt swapped for suffix.
        outfilename = re.sub(r"\.txt$", suffix, docfilename)
        if not force and os.path.exists(outfilename):
            click.echo(
                "Not overwriting {} (use --force to override)".format(outfilename)
            )
            continue
        with open(outfilename, "w", encoding="utf-8") as outfile:
            cli_util.show_hits(suggestions, project, lang, file=outfile)
328
329
330
@cli.command("eval")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--limit", "-l", default=10, help="Maximum number of subjects")
@click.option("--threshold", "-t", default=0.0, help="Minimum score threshold")
@click.option(
    "--metric",
    "-m",
    default=[],
    multiple=True,
    help="Metric to calculate (default: all)",
)
@click.option(
    "--metrics-file",
    "-M",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""",
)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_eval(
    project_id,
    paths,
    limit,
    threshold,
    docs_limit,
    metric,
    metrics_file,
    results_file,
    jobs,
    backend_param,
):
    """
    Suggest subjects for documents and evaluate the results by comparing
    against a gold standard.
    \f
    With this command the documents from ``PATHS`` (directories or possibly
    gzipped TSV files) will be assigned subject suggestions and then
    statistical measures are calculated that quantify how well the suggested
    subjects match the gold-standard subjects in the documents.

    Normally the output is the list of the metrics calculated across documents.
    If ``--results-file <FILENAME>`` option is given, the metrics are
    calculated separately for each subject, and written to the given file.
    """

    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)

    # Function-scope import — presumably to keep evaluation dependencies out
    # of the startup path of other commands; TODO confirm.
    import annif.eval

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # click.File(lazy=True) defers opening; the empty write forces the
        # file open now so a bad path fails before the long evaluation run.
        try:
            print("", end="", file=results_file)
            click.echo(
                "Writing per subject evaluation results to {!s}".format(
                    results_file.name
                )
            )
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)
            )
    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )
    # get_pool maps the --jobs value to a concrete job count and pool class.
    jobs, pool_class = annif.parallel.get_pool(jobs)

    # Prepare the project for use from pool worker processes.
    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold
    )

    # Batches may arrive in any order; every batch is folded into the same
    # accumulator, so ordering does not affect the aggregate metrics.
    with pool_class(jobs) as pool:
        for hit_sets, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            eval_batch.evaluate_many(hit_sets[project_id], subject_sets)

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(
        metrics=metric, results_file=results_file, language=project.vocab_lang
    )
    # NOTE(review): the loop variable shadows the --metric option tuple from
    # here on; harmless since the option is no longer read, but confusing.
    for metric, score in metrics.items():
        click.echo(template.format(metric + ":", score))
    if metrics_file:
        json.dump(
            {metric_code(mname): val for mname, val in metrics.items()},
            metrics_file,
            indent=2,
        )
435
436
437
FILTER_BATCH_MAX_LIMIT = 15
438
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
439
440
441
@cli.command("optimize")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel jobs (0 means all CPUs)"
)
@cli_util.docs_limit_option
@cli_util.backend_param_option
@cli_util.common_options
def run_optimize(project_id, paths, jobs, docs_limit, backend_param):
    """
    Suggest subjects for documents, testing multiple limits and thresholds.
    \f
    This command will use different limit (maximum number of subjects) and
    score threshold values when assigning subjects to each document given by
    ``PATHS`` and compare the results against the gold standard subjects in the
    documents. The output is a list of parameter combinations and their scores.
    From the output, you can determine the optimum limit and threshold
    parameters depending on which measure you want to target.
    """
    project = cli_util.get_project(project_id)
    backend_params = cli_util.parse_backend_params(backend_param, project)
    filter_params = cli_util.generate_filter_params(FILTER_BATCH_MAX_LIMIT)

    # Function-scope import — presumably to keep evaluation dependencies out
    # of the startup path of other commands; TODO confirm. (A second,
    # redundant `import annif.eval` further down has been removed.)
    import annif.eval

    corpus = cli_util.open_documents(
        paths, project.subjects, project.vocab_lang, docs_limit
    )

    # get_pool maps the --jobs value to a concrete job count and pool class.
    jobs, pool_class = annif.parallel.get_pool(jobs)

    # Prepare the project for use from pool worker processes.
    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry,
        [project_id],
        backend_params,
        limit=FILTER_BATCH_MAX_LIMIT,
        threshold=0.0,
    )

    # Fetch suggestions once, unfiltered up to the maximum limit; each
    # (limit, threshold) combination below re-filters these cached results
    # instead of running the backend again.
    ndocs = 0
    suggestion_batches = []
    subject_set_batches = []
    with pool_class(jobs) as pool:
        for suggestion_batch, subject_sets in pool.imap_unordered(
            psmap.suggest_batch, corpus.doc_batches
        ):
            ndocs += len(suggestion_batch[project_id])
            suggestion_batches.append(suggestion_batch[project_id])
            subject_set_batches.append(subject_sets)

    from annif.suggestion import SuggestionResults

    orig_suggestion_results = SuggestionResults(suggestion_batches)

    click.echo("\t".join(("Limit", "Thresh.", "Prec.", "Rec.", "F1")))

    # Track the best score seen so far for each metric, and the parameter
    # combination that produced it.
    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"

    for limit, threshold in filter_params:
        eval_batch = annif.eval.EvaluationBatch(project.subjects)
        filtered_results = orig_suggestion_results.filter(limit, threshold)
        for batch, subject_sets in zip(filtered_results.batches, subject_set_batches):
            eval_batch.evaluate_many(batch, subject_sets)
        results = eval_batch.results(metrics=OPTIMIZE_METRICS)
        for metric, score in results.items():
            # ">=" means a tie prefers the later (larger limit / threshold)
            # combination, matching the original behavior.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = (limit, threshold)
        click.echo(
            template.format(
                limit,
                threshold,
                results["Precision (doc avg)"],
                results["Recall (doc avg)"],
                results["F1 score (doc avg)"],
            )
        )

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in OPTIMIZE_METRICS:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1],
            )
        )
    click.echo("Documents evaluated:\t{}".format(ndocs))
537
538
539
@cli.command("hyperopt")
@cli_util.project_id
@click.argument("paths", type=click.Path(exists=True), nargs=-1)
@click.option("--trials", "-T", default=10, help="Number of trials")
@click.option(
    "--jobs", "-j", default=1, help="Number of parallel runs (0 means all CPUs)"
)
@click.option(
    "--metric", "-m", default="NDCG", help="Metric to optimize (default: NDCG)"
)
@click.option(
    "--results-file",
    "-r",
    type=click.File("w", encoding="utf-8", errors="ignore", lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""",
)
@cli_util.docs_limit_option
@cli_util.common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_file):
    """
    Optimize the hyperparameters of a project using validation documents from
    ``PATHS``. Not supported by all backends. Output is a list of trial results
    and a report of the best performing parameters.
    """
    proj = cli_util.get_project(project_id)
    corpus = cli_util.open_documents(paths, proj.subjects, proj.vocab_lang, docs_limit)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    # The backend performs the search and returns a recommendation record
    # with the winning score and configuration lines.
    rec = proj.hyperopt(corpus, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo("---")
    for line in rec.lines:
        click.echo(line)
    click.echo("---")
575
576
577
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
def completion(shell):
    """Generate the script for tab-key autocompletion for the given shell. To enable the
    completion support in your current bash terminal session run\n
        source <(annif completion --bash)
    """

    if shell is None:
        raise click.UsageError("Shell not given, try --bash, --zsh or --fish")

    # Explicit submodule import: the module-level `import importlib` does not
    # guarantee that the `importlib.metadata` attribute is available — it only
    # happens to work when some other import has already loaded the submodule.
    import importlib.metadata

    # Delegate to Click's completion machinery by re-invoking annif in a
    # subshell with the _ANNIF_COMPLETE environment variable set.
    script = os.popen(f"_ANNIF_COMPLETE={shell}_source annif").read()
    click.echo(f"# Generated by Annif {importlib.metadata.version('annif')}")
    click.echo(script)
593
594
595
# Allow running this module directly as a script; normally the `annif`
# console entry point invokes `cli` instead.
if __name__ == "__main__":
    cli()
597