annif.cli.common_options() - Code Metrics - Inspection of "Output eval metrics as a JSON file compatible with..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#557)

by Osma

created 2022-01-27 11:54 UTC

annif.cli.common_options() A

↳ Parent: annif.cli

Complexity

Conditions

Size

Total Lines	8
Code Lines	7

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	7
dl	0
loc	8
rs	10
c	0
b	0
f	0
cc	1
nop	1

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import os.path
import re
import sys
import json
import click
import click_log
from flask import current_app
from flask.cli import FlaskGroup, ScriptInfo
import annif
import annif.corpus
import annif.parallel
import annif.project
import annif.registry
from annif.project import Access
from annif.suggestion import SuggestionFilter, ListSuggestionResult
from annif.exception import ConfigurationException, NotSupportedException

# Reuse the package-level logger and hand it to click_log for configuration.
logger = annif.logger
click_log.basic_config(logger)

# Root command group. The group's built-in version option is switched off
# and replaced by one whose message is just the bare version string.
cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
cli = click.version_option(message='%(version)s')(cli)


def get_project(project_id):
    """Look up a project by its ID, terminating the program if it is missing.

    The registry is queried with ``Access.private`` as the minimum access
    level. If the registry raises ``ValueError``, an error message is
    written to stderr and the process exits with status 1; otherwise the
    project returned by the registry is passed back to the caller.
    """
    try:
        project = annif.registry.get_project(
            project_id, min_access=Access.private)
    except ValueError:
        message = "No projects found with id \'{0}\'.".format(project_id)
        click.echo(message, err=True)
        sys.exit(1)
    else:
        return project


def open_documents(paths, docs_limit):
    """Open a document corpus from a sequence of pathnames.

    Each path is treated as a directory of documents when it is a directory
    and as a document file otherwise. No paths yields a corpus read from the
    null device (with a warning), one path yields its corpus directly, and
    several paths are wrapped in a CombinedCorpus. When ``docs_limit`` is
    not None the result is additionally wrapped in a LimitingDocumentCorpus.
    """

    def _load_corpus(path):
        """Open one path as a corpus: directory reader or file reader."""
        if not os.path.isdir(path):
            return annif.corpus.DocumentFile(path)
        return annif.corpus.DocumentDirectory(path, require_subjects=True)

    path_count = len(paths)
    if path_count > 1:
        docs = annif.corpus.CombinedCorpus(
            [_load_corpus(path) for path in paths])
    elif path_count == 1:
        docs = _load_corpus(paths[0])
    else:
        # Nothing given: fall back to an empty input source.
        logger.warning('Reading empty file')
        docs = _load_corpus(os.path.devnull)

    if docs_limit is None:
        return docs
    return annif.corpus.LimitingDocumentCorpus(docs, docs_limit)


def parse_backend_params(backend_param, project):
    """Turn ``--backend-param`` values into a nested mapping.

    Every entry has the form ``<backend>.<parameter>=<value>``. It is split
    on the first ``.`` and then on the first ``=``, checked against the
    project with validate_backend_params(), and stored as
    ``result[backend][parameter] = value``. An entry lacking either
    separator makes the tuple unpacking raise ``ValueError``.

    Returns a ``defaultdict(dict)``; values are kept as strings.
    """
    parsed = collections.defaultdict(dict)
    for option in backend_param:
        backend_name, assignment = option.split('.', 1)
        param_name, param_value = assignment.split('=', 1)
        validate_backend_params(backend_name, option, project)
        parsed[backend_name][param_name] = param_value
    return parsed


def validate_backend_params(backend, beparam, project):
    """Check that a CLI backend parameter targets the project's backend.

    ``backend`` is the backend name taken from the option, ``beparam`` is
    the full option text (used only in the error message) and ``project``
    must expose ``config['backend']``. Raises ConfigurationException when
    the two backend names differ; returns None otherwise.
    """
    project_backend = project.config['backend']
    if backend == project_backend:
        return
    raise ConfigurationException(
        'The backend {} in CLI option "-b {}" not matching the project'
        ' backend {}.'
        .format(backend, beparam, project_backend))


# Highest suggestion limit covered by generate_filter_batches(); run_optimize
# also caps the raw hits of each document at this limit.
BATCH_MAX_LIMIT = 15


def generate_filter_batches(subjects):
    """Create a filter and an evaluation batch for each limit/threshold pair.

    Limits run from 1 to BATCH_MAX_LIMIT inclusive; thresholds are
    ``i * 0.05`` for ``i`` in ``range(20)``. The result is an OrderedDict
    keyed by ``(limit, threshold)`` whose values are
    ``(SuggestionFilter, EvaluationBatch)`` tuples, inserted limit-major.
    """
    import annif.eval
    thresholds = [step * 0.05 for step in range(20)]
    batches = collections.OrderedDict()
    for limit in range(1, BATCH_MAX_LIMIT + 1):
        for threshold in thresholds:
            batches[(limit, threshold)] = (
                SuggestionFilter(subjects, limit, threshold),
                annif.eval.EvaluationBatch(subjects))
    return batches


def set_project_config_file_path(ctx, param, value):
    """Click callback that stores the ``--projects`` path in the app config.

    The application is loaded through the context's ScriptInfo and its app
    context is entered regardless of ``value``; ``PROJECTS_FILE`` is only
    overwritten when ``value`` is truthy. ``param`` is unused. Returns None.
    """
    app = ctx.ensure_object(ScriptInfo).load_app()
    with app.app_context():
        if not value:
            return
        current_app.config['PROJECTS_FILE'] = value


def common_options(f):
    """Decorate a CLI command with the options shared by all commands.

    Adds the eager, value-less ``-p/--projects`` option (handled entirely by
    set_project_config_file_path) and then click_log's verbosity option
    bound to the module logger.
    """
    projects_option = click.option(
        '-p', '--projects', help='Set path to projects.cfg',
        type=click.Path(dir_okay=False, exists=True),
        callback=set_project_config_file_path, expose_value=False,
        is_eager=True)
    verbosity_option = click_log.simple_verbosity_option(logger)
    return verbosity_option(projects_option(f))


def backend_param_option(f):
    """Decorate a CLI command with the repeatable ``--backend-param`` option.

    The collected values are meant to be parsed with parse_backend_params().
    """
    help_text = ('Override backend parameter of the config file. '
                 'Syntax: "-b <backend>.<parameter>=<value>".')
    add_option = click.option(
        '--backend-param', '-b', multiple=True, help=help_text)
    return add_option(f)


@cli.command('list-projects')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_projects():
    """
    List available projects.
    """
    # NOTE(review): common_options already adds a verbosity option; the
    # extra one above carries default='ERROR' -- presumably to keep the
    # listing free of log output. Confirm before changing.

    # Fixed-width columns: ID, name, language, trained flag.
    row_format = "{0: <25}{1: <45}{2: <10}{3: <7}"
    heading = row_format.format(
        "Project ID", "Project Name", "Language", "Trained")
    click.echo(heading)
    click.echo("-" * len(heading))

    projects = annif.registry.get_projects(min_access=Access.private)
    for project in projects.values():
        row = row_format.format(
            project.project_id,
            project.name,
            project.language,
            str(project.is_trained))
        click.echo(row)


@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """
    # get_project() exits the program itself when the ID is unknown.
    project = get_project(project_id)

    # One "label: value" line per attribute, labels padded to equal width.
    click.echo(f'Project ID:        {project.project_id}')
    click.echo(f'Project Name:      {project.name}')
    click.echo(f'Language:          {project.language}')
    click.echo(f'Access:            {project.access.name}')
    click.echo(f'Trained:           {project.is_trained}')
    click.echo(f'Modification time: {project.modification_time}')


@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Resolve the project (exits on unknown ID) and drop its model data.
    get_project(project_id).remove_model_data()


@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    project = get_project(project_id)
    if not annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
        # Not recognised as RDF, so it is read as a TSV subject file.
        vocabulary_source = annif.corpus.SubjectFileTSV(subjectfile)
    else:
        # SKOS/RDF input, read in the project's language.
        vocabulary_source = annif.corpus.SubjectFileSKOS(
            subjectfile, project.language)
    project.vocab.load_vocabulary(vocabulary_source, project.language)


@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--cached/--no-cached', '-c/-C', default=False,
              help='Reuse preprocessed training data from previous run')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--jobs',
              '-j',
              default=0,
              help='Number of parallel jobs (0 means choose automatically)')
@backend_param_option
@common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    """
    project = get_project(project_id)
    params = parse_backend_params(backend_param, project)

    if not cached:
        corpus = open_documents(paths, docs_limit)
    elif paths:
        # --cached and explicit corpus paths are mutually exclusive.
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option.")
    else:
        # The literal string 'cached' is passed in place of a corpus.
        corpus = 'cached'

    project.train(corpus, params, jobs)


@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    """
    project = get_project(project_id)
    # Backend overrides are validated against the project before any
    # document is opened.
    params = parse_backend_params(backend_param, project)
    corpus = open_documents(paths, docs_limit)
    project.learn(corpus, params)


@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    document_text = sys.stdin.read()
    params = parse_backend_params(backend_param, project)

    suggestion_filter = SuggestionFilter(project.subjects, limit, threshold)
    raw_suggestions = project.suggest(document_text, params)
    filtered = suggestion_filter(raw_suggestions)

    # One tab-separated line per suggestion: <uri>, label and/or notation
    # (whichever are non-empty), score.
    for suggestion in filtered.as_list(project.subjects):
        described = '\t'.join(
            part for part in (suggestion.label, suggestion.notation)
            if part)
        click.echo("<{}>\t{}\t{}".format(
            suggestion.uri, described, suggestion.score))


@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
    '--suffix',
    '-s',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', '-f/-F', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        # The result file name is the document name with a trailing ".txt"
        # replaced by the chosen suffix.
        subjectfilename = re.sub(r'\.txt$', suffix, docfilename)

        # Decide whether to skip before touching the document, so documents
        # whose results already exist are not read at all.
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue

        # 'utf-8-sig' is used for decoding the document text.
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()

        # Compute the suggestions before opening the output for writing:
        # if suggesting fails, an existing result file is left untouched
        # instead of being truncated.
        results = project.suggest(text, backend_params)
        hits = hit_filter(results).as_list(project.subjects)

        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hits:
                line = "<{}>\t{}\t{}".format(
                    hit.uri,
                    '\t'.join(filter(None, (hit.label, hit.notation))),
                    hit.score)
                click.echo(line, file=subjfile)


@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option(
    '--metrics-file',
    '-M',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""")
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""")
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel jobs (0 means all CPUs)')
@backend_param_option
@common_options
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        docs_limit,
        metrics_file,
        results_file,
        jobs,
        backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    # Imported here rather than at module level, as in
    # generate_filter_batches().
    import annif.eval
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # The file option is declared lazy, so an empty write is used to
            # surface problems with the file before the evaluation starts.
            print('', end='', file=results_file)
            click.echo('Writing per subject evaluation results to {!s}'.format(
                results_file.name))
        except Exception as e:
            # Keep the original error attached as the cause for debugging.
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)) from e
    docs = open_documents(paths, docs_limit)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    with pool_class(jobs) as pool:
        # Results arrive in arbitrary order; each one is fed straight into
        # the evaluation batch together with its gold-standard subjects.
        for hits, uris, labels in pool.imap_unordered(
                psmap.suggest, docs.documents):
            eval_batch.evaluate(hits[project_id],
                                annif.corpus.SubjectSet((uris, labels)))

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(results_file=results_file)
    for metric, score in metrics.items():
        click.echo(template.format(metric + ":", score))
    if metrics_file:
        # Only the JSON document goes to the metrics file; the metrics were
        # already echoed above, so nothing extra is printed to stdout.
        json.dump(metrics, metrics_file, indent=2)


@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, docs_limit)
    for doc in docs.documents:
        raw_hits = project.suggest(doc.text, backend_params)
        # Cap the hits once at the largest limit any filter will ask for.
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), \
            "Optimize should only be done with ListSuggestionResult " + \
            "as it would be very slow with VectorSuggestionResult."
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    # The reported metrics are the same for every batch, so the list is
    # built once here; it is also needed by the summary after the loop.
    metrics = ['Precision (doc avg)',
               'Recall (doc avg)',
               'F1 score (doc avg)']

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a queue that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        params, filter_batch = filter_batches.popleft()
        results = filter_batch[1].results(metrics=metrics)
        for metric, score in results.items():
            # ">=" means that on ties the later combination wins.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))


@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--trials', '-T', default=10, help='Number of trials')
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel runs (0 means all CPUs)')
@click.option('--metric', '-m', default='NDCG',
              help='Metric to optimize (default: NDCG)')
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""")
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric,
                 results_file):
    """
    Optimize the hyperparameters of a project using a validation corpus.
    """
    project = get_project(project_id)
    corpus = open_documents(paths, docs_limit)

    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    recommendation = project.hyperopt(
        corpus, trials, jobs, metric, results_file)

    # Report the best score, then the recommendation's lines between two
    # separator rows.
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")
    separator = "---"
    click.echo(separator)
    for recommended_line in recommendation.lines:
        click.echo(recommended_line)
    click.echo(separator)


# Run the command group when this module is executed as a script.
if __name__ == '__main__':
    cli()


1		"""Definitions for command-line (Click) commands for invoking Annif
2		operations and printing the results to console."""
3
4
5		import collections
6		import os.path
7		import re
8		import sys
9		import json
10		import click
11		import click_log
12		from flask import current_app
13		from flask.cli import FlaskGroup, ScriptInfo
14		import annif
15		import annif.corpus
16		import annif.parallel
17		import annif.project
18		import annif.registry
19		from annif.project import Access
20		from annif.suggestion import SuggestionFilter, ListSuggestionResult
21		from annif.exception import ConfigurationException, NotSupportedException
22
23		logger = annif.logger
24		click_log.basic_config(logger)
25
26		cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
27		cli = click.version_option(message='%(version)s')(cli)
28
29
30		def get_project(project_id):
31		"""
32		Helper function to get a project by ID and bail out if it doesn't exist"""
33		try:
34		return annif.registry.get_project(project_id,
35		min_access=Access.private)
36		except ValueError:
37		click.echo(
38		"No projects found with id \'{0}\'.".format(project_id),
39		err=True)
40		sys.exit(1)
41
42
43	View Code Duplication	def open_documents(paths, docs_limit):
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
44		"""Helper function to open a document corpus from a list of pathnames,
45		each of which is either a TSV file or a directory of TXT files. The
46		corpus will be returned as an instance of DocumentCorpus or
47		LimitingDocumentCorpus."""
48
49		def open_doc_path(path):
50		"""open a single path and return it as a DocumentCorpus"""
51		if os.path.isdir(path):
52		return annif.corpus.DocumentDirectory(path, require_subjects=True)
53		return annif.corpus.DocumentFile(path)
54
55		if len(paths) == 0:
56		logger.warning('Reading empty file')
57		docs = open_doc_path(os.path.devnull)
58		elif len(paths) == 1:
59		docs = open_doc_path(paths[0])
60		else:
61		corpora = [open_doc_path(path) for path in paths]
62		docs = annif.corpus.CombinedCorpus(corpora)
63		if docs_limit is not None:
64		docs = annif.corpus.LimitingDocumentCorpus(docs, docs_limit)
65		return docs
66
67
68	View Code Duplication	def parse_backend_params(backend_param, project):
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
69		"""Parse a list of backend parameters given with the --backend-param
70		option into a nested dict structure"""
71		backend_params = collections.defaultdict(dict)
72		for beparam in backend_param:
73		backend, param = beparam.split('.', 1)
74		key, val = param.split('=', 1)
75		validate_backend_params(backend, beparam, project)
76		backend_params[backend][key] = val
77		return backend_params
78
79
80		def validate_backend_params(backend, beparam, project):
81		if backend != project.config['backend']:
82		raise ConfigurationException(
83		'The backend {} in CLI option "-b {}" not matching the project'
84		' backend {}.'
85		.format(backend, beparam, project.config['backend']))
86
87
88		BATCH_MAX_LIMIT = 15
89
90
91		def generate_filter_batches(subjects):
92		import annif.eval
93		filter_batches = collections.OrderedDict()
94		for limit in range(1, BATCH_MAX_LIMIT + 1):
95		for threshold in [i * 0.05 for i in range(20)]:
96		hit_filter = SuggestionFilter(subjects, limit, threshold)
97		batch = annif.eval.EvaluationBatch(subjects)
98		filter_batches[(limit, threshold)] = (hit_filter, batch)
99		return filter_batches
100
101
102		def set_project_config_file_path(ctx, param, value):
103		"""Override the default path or the path given in env by CLI option"""
104		with ctx.ensure_object(ScriptInfo).load_app().app_context():
105		if value:
106		current_app.config['PROJECTS_FILE'] = value
107
108
109		def common_options(f):
110		"""Decorator to add common options for all CLI commands"""
111		f = click.option(
112		'-p', '--projects', help='Set path to projects.cfg',
113		type=click.Path(dir_okay=False, exists=True),
114		callback=set_project_config_file_path, expose_value=False,
115		is_eager=True)(f)
116		return click_log.simple_verbosity_option(logger)(f)
117
118
119		def backend_param_option(f):
120		"""Decorator to add an option for CLI commands to override BE parameters"""
121		return click.option(
122		'--backend-param', '-b', multiple=True,
123		help='Override backend parameter of the config file. ' +
124		'Syntax: "-b <backend>.<parameter>=<value>".')(f)
125
126
127	View Code Duplication	@cli.command('list-projects')
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
128		@common_options
129		@click_log.simple_verbosity_option(logger, default='ERROR')
130		def run_list_projects():
131		"""
132		List available projects.
133		"""
134
135		template = "{0: <25}{1: <45}{2: <10}{3: <7}"
136		header = template.format(
137		"Project ID", "Project Name", "Language", "Trained")
138		click.echo(header)
139		click.echo("-" * len(header))
140		for proj in annif.registry.get_projects(
141		min_access=Access.private).values():
142		click.echo(template.format(
143		proj.project_id, proj.name, proj.language, str(proj.is_trained)))
144
145
146		@cli.command('show-project')
147		@click.argument('project_id')
148		@common_options
149		def run_show_project(project_id):
150		"""
151		Show information about a project.
152		"""
153
154		proj = get_project(project_id)
155		click.echo(f'Project ID: {proj.project_id}')
156		click.echo(f'Project Name: {proj.name}')
157		click.echo(f'Language: {proj.language}')
158		click.echo(f'Access: {proj.access.name}')
159		click.echo(f'Trained: {proj.is_trained}')
160		click.echo(f'Modification time: {proj.modification_time}')
161
162
163		@cli.command('clear')
164		@click.argument('project_id')
165		@common_options
166		def run_clear_project(project_id):
167		"""
168		Initialize the project to its original, untrained state.
169		"""
170		proj = get_project(project_id)
171		proj.remove_model_data()
172
173
174	View Code Duplication	@cli.command('loadvoc')
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
175		@click.argument('project_id')
176		@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
177		@common_options
178		def run_loadvoc(project_id, subjectfile):
179		"""
180		Load a vocabulary for a project.
181		"""
182		proj = get_project(project_id)
183		if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
184		# SKOS/RDF file supported by rdflib
185		subjects = annif.corpus.SubjectFileSKOS(subjectfile, proj.language)
186		else:
187		# probably a TSV file
188		subjects = annif.corpus.SubjectFileTSV(subjectfile)
189		proj.vocab.load_vocabulary(subjects, proj.language)
190
191
192	View Code Duplication	@cli.command('train')
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
193		@click.argument('project_id')
194		@click.argument('paths', type=click.Path(exists=True), nargs=-1)
195		@click.option('--cached/--no-cached', '-c/-C', default=False,
196		help='Reuse preprocessed training data from previous run')
197		@click.option('--docs-limit', '-d', default=None,
198		type=click.IntRange(0, None),
199		help='Maximum number of documents to use')
200		@click.option('--jobs',
201		'-j',
202		default=0,
203		help='Number of parallel jobs (0 means choose automatically)')
204		@backend_param_option
205		@common_options
206		def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
207		"""
208		Train a project on a collection of documents.
209		"""
210		proj = get_project(project_id)
211		backend_params = parse_backend_params(backend_param, proj)
212		if cached:
213		if len(paths) > 0:
214		raise click.UsageError(
215		"Corpus paths cannot be given when using --cached option.")
216		documents = 'cached'
217		else:
218		documents = open_documents(paths, docs_limit)
219		proj.train(documents, backend_params, jobs)
220
221
222	View Code Duplication	@cli.command('learn')
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
223		@click.argument('project_id')
224		@click.argument('paths', type=click.Path(exists=True), nargs=-1)
225		@click.option('--docs-limit', '-d', default=None,
226		type=click.IntRange(0, None),
227		help='Maximum number of documents to use')
228		@backend_param_option
229		@common_options
230		def run_learn(project_id, paths, docs_limit, backend_param):
231		"""
232		Further train an existing project on a collection of documents.
233		"""
234		proj = get_project(project_id)
235		backend_params = parse_backend_params(backend_param, proj)
236		documents = open_documents(paths, docs_limit)
237		proj.learn(documents, backend_params)
238
239
240	View Code Duplication	@cli.command('suggest')
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
241		@click.argument('project_id')
242		@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
243		@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
244		@backend_param_option
245		@common_options
246		def run_suggest(project_id, limit, threshold, backend_param):
247		"""
248		Suggest subjects for a single document from standard input.
249		"""
250		project = get_project(project_id)
251		text = sys.stdin.read()
252		backend_params = parse_backend_params(backend_param, project)
253		hit_filter = SuggestionFilter(project.subjects, limit, threshold)
254		hits = hit_filter(project.suggest(text, backend_params))
255		for hit in hits.as_list(project.subjects):
256		click.echo(
257		"<{}>\t{}\t{}".format(
258		hit.uri,
259		'\t'.join(filter(None, (hit.label, hit.notation))),
260		hit.score))
261
262
263	View Code Duplication	@cli.command('index')
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
264		@click.argument('project_id')
265		@click.argument('directory', type=click.Path(exists=True, file_okay=False))
266		@click.option(
267		'--suffix',
268		'-s',
269		default='.annif',
270		help='File name suffix for result files')
271		@click.option('--force/--no-force', '-f/-F', default=False,
272		help='Force overwriting of existing result files')
273		@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
274		@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
275		@backend_param_option
276		@common_options
277		def run_index(project_id, directory, suffix, force,
278		limit, threshold, backend_param):
279		"""
280		Index a directory with documents, suggesting subjects for each document.
281		Write the results in TSV files with the given suffix.
282		"""
283		project = get_project(project_id)
284		backend_params = parse_backend_params(backend_param, project)
285		hit_filter = SuggestionFilter(project.subjects, limit, threshold)
286
287		for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
288		directory, require_subjects=False):
289		with open(docfilename, encoding='utf-8-sig') as docfile:
290		text = docfile.read()
291		subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
292		if os.path.exists(subjectfilename) and not force:
293		click.echo(
294		"Not overwriting {} (use --force to override)".format(
295		subjectfilename))
296		continue
297		with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
298		results = project.suggest(text, backend_params)
299		for hit in hit_filter(results).as_list(project.subjects):
300		line = "<{}>\t{}\t{}".format(
301		hit.uri,
302		'\t'.join(filter(None, (hit.label, hit.notation))),
303		hit.score)
304		click.echo(line, file=subjfile)
305
306
@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option(
    '--metrics-file',
    '-M',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""")
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""")
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel jobs (0 means all CPUs)')
@backend_param_option
@common_options
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        docs_limit,
        metrics_file,
        results_file,
        jobs,
        backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    # NOTE: the docstring above doubles as the Click help text for the
    # command, so implementation notes are kept in comments instead.

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    # Imported locally rather than at module level, as in the original code.
    import annif.eval
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # The option is declared with lazy=True, so nothing has been opened
        # yet. Writing an empty string forces the open now, so an unwritable
        # path is reported before any documents are processed.
        try:
            print('', end='', file=results_file)
        except Exception as e:
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)) from e
        click.echo('Writing per subject evaluation results to {!s}'.format(
            results_file.name))
    docs = open_documents(paths, docs_limit)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    with pool_class(jobs) as pool:
        # imap_unordered yields results in completion order; each result is
        # simply added to the evaluation batch as it arrives.
        for hits, uris, labels in pool.imap_unordered(
                psmap.suggest, docs.documents):
            eval_batch.evaluate(hits[project_id],
                                annif.corpus.SubjectSet((uris, labels)))

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(results_file=results_file)
    for metric, score in metrics.items():
        click.echo(template.format(metric + ":", score))
    if metrics_file:
        # Only the JSON file is written here; the metrics have already been
        # echoed to the console in formatted form above.
        json.dump(metrics, metrics_file, indent=2)
394
395
@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    # NOTE: the docstring above doubles as the Click help text for the
    # command, so implementation notes are kept in comments instead.
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    # Mapping whose keys are (limit, threshold) pairs and whose values are
    # (hit_filter, batch) pairs, as used by the loops below.
    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, docs_limit)
    for doc in docs.documents:
        raw_hits = project.suggest(doc.text, backend_params)
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), \
            "Optimize should only be done with ListSuggestionResult " + \
            "as it would be very slow with VectorSuggestionResult."
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        # Score the same suggestions under every limit/threshold combination
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    # The metric list is loop-invariant and is also needed for the summary
    # after the loop, so it is defined once up front.
    metrics = ['Precision (doc avg)',
               'Recall (doc avg)',
               'F1 score (doc avg)']

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a queue that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    pending = collections.deque(filter_batches.items())
    # Drop the mapping itself so the queue holds the only references
    del filter_batches
    while pending:
        params, filter_batch = pending.popleft()
        results = filter_batch[1].results(metrics=metrics)
        for metric, score in results.items():
            # ">=" means that on ties the later combination wins
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))
468
469
@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option('--trials', '-T', default=10, help='Number of trials')
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel runs (0 means all CPUs)')
@click.option('--metric', '-m', default='NDCG',
              help='Metric to optimize (default: NDCG)')
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""")
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric,
                 results_file):
    """
    Optimize the hyperparameters of a project using a validation corpus.
    """
    # Resolve the project first, then open the validation corpus.
    project = get_project(project_id)
    corpus = open_documents(paths, docs_limit)

    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    recommendation = project.hyperopt(
        corpus, trials, jobs, metric, results_file)

    # Report the best score, followed by the recommendation lines framed
    # between two separator rows.
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")
    separator = "---"
    click.echo(separator)
    for rec_line in recommendation.lines:
        click.echo(rec_line)
    click.echo(separator)
508
509
# Invoke the Click command group when this module is executed as a script.
if __name__ == "__main__":
    cli()
512

NatLibFi / Annif

Pull Request — master (#557)

annif.cli.common_options() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like