| Total Complexity | 54 |
| Total Lines | 514 |
| Duplicated Lines | 72.18 % |
| Changes | 0 |
Duplicate code is one of the most pungent code smells. A common rule of thumb is to restructure code once it is duplicated in three or more places (the "rule of three").
Most duplication problems share the same cure: extract the repeated logic into a single shared function, method, or class and call it from every place that previously held a copy.
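As a hedged illustration of that rule (not taken from the analysed code), the sketch below shows a Click option that has been copy-pasted into several commands being pulled out into one reusable decorator. The toy CLI and command names are invented for the example.

```python
import click


def docs_limit_option(f):
    """Shared decorator for an option that used to be duplicated per command."""
    return click.option(
        '--docs-limit', '-d', default=None, type=click.IntRange(0, None),
        help='Maximum number of documents to use')(f)


@click.group()
def cli():
    """Toy command group used only for this illustration."""


@cli.command()
@docs_limit_option
def train(docs_limit):
    # The shared option arrives as a normal keyword argument.
    click.echo(f'training, docs_limit={docs_limit}')


@cli.command()
@docs_limit_option
def evaluate(docs_limit):
    click.echo(f'evaluating, docs_limit={docs_limit}')


if __name__ == '__main__':
    cli()
```

annif.cli itself already uses this pattern for its shared options (see `common_options` and `backend_param_option` in the listing below).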
Complex classes and modules like annif.cli often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.
Once you have determined the fields and methods that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
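To make the refactoring concrete, here is a minimal before/after sketch of Extract Class, assuming an invented class whose `report_`-prefixed members form the cohesive component; none of these names come from annif.

```python
# Before: report-related state and behaviour are tangled into a larger class.
class ProjectToolBefore:
    def __init__(self):
        self.report_path = 'results.tsv'
        self.report_encoding = 'utf-8'

    def report_write(self, rows):
        with open(self.report_path, 'w', encoding=self.report_encoding) as out:
            for row in rows:
                out.write('\t'.join(row) + '\n')


# After: the shared "report_" prefix marked a component worth extracting.
class ReportWriter:
    def __init__(self, path='results.tsv', encoding='utf-8'):
        self.path = path
        self.encoding = encoding

    def write(self, rows):
        with open(self.path, 'w', encoding=self.encoding) as out:
            for row in rows:
                out.write('\t'.join(row) + '\n')


class ProjectToolAfter:
    def __init__(self):
        # The tool now delegates report handling to the extracted class.
        self.report = ReportWriter()
```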
| 1 | """Definitions for command-line (Click) commands for invoking Annif |
||
| 2 | operations and printing the results to console.""" |
||
| 3 | |||
| 4 | |||
| 5 | import collections |
||
| 6 | import os.path |
||
| 7 | import re |
||
| 8 | import sys |
||
| 9 | import json |
||
| 10 | import click |
||
| 11 | import click_log |
||
| 12 | from flask import current_app |
||
| 13 | from flask.cli import FlaskGroup, ScriptInfo |
||
| 14 | import annif |
||
| 15 | import annif.corpus |
||
| 16 | import annif.parallel |
||
| 17 | import annif.project |
||
| 18 | import annif.registry |
||
| 19 | from annif.project import Access |
||
| 20 | from annif.suggestion import SuggestionFilter, ListSuggestionResult |
||
| 21 | from annif.exception import ConfigurationException, NotSupportedException |
||
| 22 | from annif.util import metric_code |
||
| 23 | |||
| 24 | logger = annif.logger |
||
| 25 | click_log.basic_config(logger) |
||
| 26 | |||
| 27 | cli = FlaskGroup(create_app=annif.create_app, add_version_option=False) |
||
| 28 | cli = click.version_option(message='%(version)s')(cli) |
||
| 29 | |||
| 30 | |||
| 31 | def get_project(project_id): |
||
| 32 | """ |
||
| 33 | Helper function to get a project by ID and bail out if it doesn't exist""" |
||
| 34 | try: |
||
| 35 | return annif.registry.get_project(project_id, |
||
| 36 | min_access=Access.private) |
||
| 37 | except ValueError: |
||
| 38 | click.echo( |
||
| 39 | "No projects found with id \'{0}\'.".format(project_id), |
||
| 40 | err=True) |
||
| 41 | sys.exit(1) |
||
| 42 | |||
| 43 | |||
| 44 | View Code Duplication | def open_documents(paths, docs_limit): |
|
|
|
|||
| 45 | """Helper function to open a document corpus from a list of pathnames, |
||
| 46 | each of which is either a TSV file or a directory of TXT files. The |
||
| 47 | corpus will be returned as an instance of DocumentCorpus or |
||
| 48 | LimitingDocumentCorpus.""" |
||
| 49 | |||
| 50 | def open_doc_path(path): |
||
| 51 | """open a single path and return it as a DocumentCorpus""" |
||
| 52 | if os.path.isdir(path): |
||
| 53 | return annif.corpus.DocumentDirectory(path, require_subjects=True) |
||
| 54 | return annif.corpus.DocumentFile(path) |
||
| 55 | |||
| 56 | if len(paths) == 0: |
||
| 57 | logger.warning('Reading empty file') |
||
| 58 | docs = open_doc_path(os.path.devnull) |
||
| 59 | elif len(paths) == 1: |
||
| 60 | docs = open_doc_path(paths[0]) |
||
| 61 | else: |
||
| 62 | corpora = [open_doc_path(path) for path in paths] |
||
| 63 | docs = annif.corpus.CombinedCorpus(corpora) |
||
| 64 | if docs_limit is not None: |
||
| 65 | docs = annif.corpus.LimitingDocumentCorpus(docs, docs_limit) |
||
| 66 | return docs |
||
| 67 | |||
| 68 | |||
| 69 | View Code Duplication | def parse_backend_params(backend_param, project): |
|
| 70 | """Parse a list of backend parameters given with the --backend-param |
||
| 71 | option into a nested dict structure""" |
||
| 72 | backend_params = collections.defaultdict(dict) |
||
| 73 | for beparam in backend_param: |
||
| 74 | backend, param = beparam.split('.', 1) |
||
| 75 | key, val = param.split('=', 1) |
||
| 76 | validate_backend_params(backend, beparam, project) |
||
| 77 | backend_params[backend][key] = val |
||
| 78 | return backend_params |
||
| 79 | |||
| 80 | |||
| 81 | def validate_backend_params(backend, beparam, project): |
||
| 82 | if backend != project.config['backend']: |
||
| 83 | raise ConfigurationException( |
||
| 84 | 'The backend {} in CLI option "-b {}" not matching the project' |
||
| 85 | ' backend {}.' |
||
| 86 | .format(backend, beparam, project.config['backend'])) |
||
| 87 | |||
| 88 | |||
| 89 | BATCH_MAX_LIMIT = 15 |
||
| 90 | |||
| 91 | |||
| 92 | def generate_filter_batches(subjects): |
||
| 93 | import annif.eval |
||
| 94 | filter_batches = collections.OrderedDict() |
||
| 95 | for limit in range(1, BATCH_MAX_LIMIT + 1): |
||
| 96 | for threshold in [i * 0.05 for i in range(20)]: |
||
| 97 | hit_filter = SuggestionFilter(subjects, limit, threshold) |
||
| 98 | batch = annif.eval.EvaluationBatch(subjects) |
||
| 99 | filter_batches[(limit, threshold)] = (hit_filter, batch) |
||
| 100 | return filter_batches |
||
| 101 | |||
| 102 | |||
| 103 | def set_project_config_file_path(ctx, param, value): |
||
| 104 | """Override the default path or the path given in env by CLI option""" |
||
| 105 | with ctx.ensure_object(ScriptInfo).load_app().app_context(): |
||
| 106 | if value: |
||
| 107 | current_app.config['PROJECTS_FILE'] = value |
||
| 108 | |||
| 109 | |||
| 110 | def common_options(f): |
||
| 111 | """Decorator to add common options for all CLI commands""" |
||
| 112 | f = click.option( |
||
| 113 | '-p', '--projects', help='Set path to projects.cfg', |
||
| 114 | type=click.Path(dir_okay=False, exists=True), |
||
| 115 | callback=set_project_config_file_path, expose_value=False, |
||
| 116 | is_eager=True)(f) |
||
| 117 | return click_log.simple_verbosity_option(logger)(f) |
||
| 118 | |||
| 119 | |||
| 120 | def backend_param_option(f): |
||
| 121 | """Decorator to add an option for CLI commands to override BE parameters""" |
||
| 122 | return click.option( |
||
| 123 | '--backend-param', '-b', multiple=True, |
||
| 124 | help='Override backend parameter of the config file. ' + |
||
| 125 | 'Syntax: "-b <backend>.<parameter>=<value>".')(f) |
||
| 126 | |||
| 127 | |||
| 128 | View Code Duplication | @cli.command('list-projects') |
|
| 129 | @common_options |
||
| 130 | @click_log.simple_verbosity_option(logger, default='ERROR') |
||
| 131 | def run_list_projects(): |
||
| 132 | """ |
||
| 133 | List available projects. |
||
| 134 | """ |
||
| 135 | |||
| 136 | template = "{0: <25}{1: <45}{2: <10}{3: <7}" |
||
| 137 | header = template.format( |
||
| 138 | "Project ID", "Project Name", "Language", "Trained") |
||
| 139 | click.echo(header) |
||
| 140 | click.echo("-" * len(header)) |
||
| 141 | for proj in annif.registry.get_projects( |
||
| 142 | min_access=Access.private).values(): |
||
| 143 | click.echo(template.format( |
||
| 144 | proj.project_id, proj.name, proj.language, str(proj.is_trained))) |
||
| 145 | |||
| 146 | |||
| 147 | @cli.command('show-project') |
||
| 148 | @click.argument('project_id') |
||
| 149 | @common_options |
||
| 150 | def run_show_project(project_id): |
||
| 151 | """ |
||
| 152 | Show information about a project. |
||
| 153 | """ |
||
| 154 | |||
| 155 | proj = get_project(project_id) |
||
| 156 | click.echo(f'Project ID: {proj.project_id}') |
||
| 157 | click.echo(f'Project Name: {proj.name}') |
||
| 158 | click.echo(f'Language: {proj.language}') |
||
| 159 | click.echo(f'Access: {proj.access.name}') |
||
| 160 | click.echo(f'Trained: {proj.is_trained}') |
||
| 161 | click.echo(f'Modification time: {proj.modification_time}') |
||
| 162 | |||
| 163 | |||
| 164 | @cli.command('clear') |
||
| 165 | @click.argument('project_id') |
||
| 166 | @common_options |
||
| 167 | def run_clear_project(project_id): |
||
| 168 | """ |
||
| 169 | Initialize the project to its original, untrained state. |
||
| 170 | """ |
||
| 171 | proj = get_project(project_id) |
||
| 172 | proj.remove_model_data() |
||
| 173 | |||
| 174 | |||
| 175 | View Code Duplication | @cli.command('loadvoc') |
|
| 176 | @click.argument('project_id') |
||
| 177 | @click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False)) |
||
| 178 | @common_options |
||
| 179 | def run_loadvoc(project_id, subjectfile): |
||
| 180 | """ |
||
| 181 | Load a vocabulary for a project. |
||
| 182 | """ |
||
| 183 | proj = get_project(project_id) |
||
| 184 | if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile): |
||
| 185 | # SKOS/RDF file supported by rdflib |
||
| 186 | subjects = annif.corpus.SubjectFileSKOS(subjectfile, proj.language) |
||
| 187 | else: |
||
| 188 | # probably a TSV file |
||
| 189 | subjects = annif.corpus.SubjectFileTSV(subjectfile) |
||
| 190 | proj.vocab.load_vocabulary(subjects, proj.language) |
||
| 191 | |||
| 192 | |||
| 193 | View Code Duplication | @cli.command('train') |
|
| 194 | @click.argument('project_id') |
||
| 195 | @click.argument('paths', type=click.Path(exists=True), nargs=-1) |
||
| 196 | @click.option('--cached/--no-cached', '-c/-C', default=False, |
||
| 197 | help='Reuse preprocessed training data from previous run') |
||
| 198 | @click.option('--docs-limit', '-d', default=None, |
||
| 199 | type=click.IntRange(0, None), |
||
| 200 | help='Maximum number of documents to use') |
||
| 201 | @click.option('--jobs', |
||
| 202 | '-j', |
||
| 203 | default=0, |
||
| 204 | help='Number of parallel jobs (0 means choose automatically)') |
||
| 205 | @backend_param_option |
||
| 206 | @common_options |
||
| 207 | def run_train(project_id, paths, cached, docs_limit, jobs, backend_param): |
||
| 208 | """ |
||
| 209 | Train a project on a collection of documents. |
||
| 210 | """ |
||
| 211 | proj = get_project(project_id) |
||
| 212 | backend_params = parse_backend_params(backend_param, proj) |
||
| 213 | if cached: |
||
| 214 | if len(paths) > 0: |
||
| 215 | raise click.UsageError( |
||
| 216 | "Corpus paths cannot be given when using --cached option.") |
||
| 217 | documents = 'cached' |
||
| 218 | else: |
||
| 219 | documents = open_documents(paths, docs_limit) |
||
| 220 | proj.train(documents, backend_params, jobs) |
||
| 221 | |||
| 222 | |||
| 223 | View Code Duplication | @cli.command('learn') |
|
| 224 | @click.argument('project_id') |
||
| 225 | @click.argument('paths', type=click.Path(exists=True), nargs=-1) |
||
| 226 | @click.option('--docs-limit', '-d', default=None, |
||
| 227 | type=click.IntRange(0, None), |
||
| 228 | help='Maximum number of documents to use') |
||
| 229 | @backend_param_option |
||
| 230 | @common_options |
||
| 231 | def run_learn(project_id, paths, docs_limit, backend_param): |
||
| 232 | """ |
||
| 233 | Further train an existing project on a collection of documents. |
||
| 234 | """ |
||
| 235 | proj = get_project(project_id) |
||
| 236 | backend_params = parse_backend_params(backend_param, proj) |
||
| 237 | documents = open_documents(paths, docs_limit) |
||
| 238 | proj.learn(documents, backend_params) |
||
| 239 | |||
| 240 | |||
| 241 | View Code Duplication | @cli.command('suggest') |
|
| 242 | @click.argument('project_id') |
||
| 243 | @click.option('--limit', '-l', default=10, help='Maximum number of subjects') |
||
| 244 | @click.option('--threshold', '-t', default=0.0, help='Minimum score threshold') |
||
| 245 | @backend_param_option |
||
| 246 | @common_options |
||
| 247 | def run_suggest(project_id, limit, threshold, backend_param): |
||
| 248 | """ |
||
| 249 | Suggest subjects for a single document from standard input. |
||
| 250 | """ |
||
| 251 | project = get_project(project_id) |
||
| 252 | text = sys.stdin.read() |
||
| 253 | backend_params = parse_backend_params(backend_param, project) |
||
| 254 | hit_filter = SuggestionFilter(project.subjects, limit, threshold) |
||
| 255 | hits = hit_filter(project.suggest(text, backend_params)) |
||
| 256 | for hit in hits.as_list(project.subjects): |
||
| 257 | click.echo( |
||
| 258 | "<{}>\t{}\t{}".format( |
||
| 259 | hit.uri, |
||
| 260 | '\t'.join(filter(None, (hit.label, hit.notation))), |
||
| 261 | hit.score)) |
||
| 262 | |||
| 263 | |||
| 264 | View Code Duplication | @cli.command('index') |
|
| 265 | @click.argument('project_id') |
||
| 266 | @click.argument('directory', type=click.Path(exists=True, file_okay=False)) |
||
| 267 | @click.option( |
||
| 268 | '--suffix', |
||
| 269 | '-s', |
||
| 270 | default='.annif', |
||
| 271 | help='File name suffix for result files') |
||
| 272 | @click.option('--force/--no-force', '-f/-F', default=False, |
||
| 273 | help='Force overwriting of existing result files') |
||
| 274 | @click.option('--limit', '-l', default=10, help='Maximum number of subjects') |
||
| 275 | @click.option('--threshold', '-t', default=0.0, help='Minimum score threshold') |
||
| 276 | @backend_param_option |
||
| 277 | @common_options |
||
| 278 | def run_index(project_id, directory, suffix, force, |
||
| 279 | limit, threshold, backend_param): |
||
| 280 | """ |
||
| 281 | Index a directory with documents, suggesting subjects for each document. |
||
| 282 | Write the results in TSV files with the given suffix. |
||
| 283 | """ |
||
| 284 | project = get_project(project_id) |
||
| 285 | backend_params = parse_backend_params(backend_param, project) |
||
| 286 | hit_filter = SuggestionFilter(project.subjects, limit, threshold) |
||
| 287 | |||
| 288 | for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory( |
||
| 289 | directory, require_subjects=False): |
||
| 290 | with open(docfilename, encoding='utf-8-sig') as docfile: |
||
| 291 | text = docfile.read() |
||
| 292 | subjectfilename = re.sub(r'\.txt$', suffix, docfilename) |
||
| 293 | if os.path.exists(subjectfilename) and not force: |
||
| 294 | click.echo( |
||
| 295 | "Not overwriting {} (use --force to override)".format( |
||
| 296 | subjectfilename)) |
||
| 297 | continue |
||
| 298 | with open(subjectfilename, 'w', encoding='utf-8') as subjfile: |
||
| 299 | results = project.suggest(text, backend_params) |
||
| 300 | for hit in hit_filter(results).as_list(project.subjects): |
||
| 301 | line = "<{}>\t{}\t{}".format( |
||
| 302 | hit.uri, |
||
| 303 | '\t'.join(filter(None, (hit.label, hit.notation))), |
||
| 304 | hit.score) |
||
| 305 | click.echo(line, file=subjfile) |
||
| 306 | |||
| 307 | |||
| 308 | View Code Duplication | @cli.command('eval') |
|
| 309 | @click.argument('project_id') |
||
| 310 | @click.argument('paths', type=click.Path(exists=True), nargs=-1) |
||
| 311 | @click.option('--limit', '-l', default=10, help='Maximum number of subjects') |
||
| 312 | @click.option('--threshold', '-t', default=0.0, help='Minimum score threshold') |
||
| 313 | @click.option('--docs-limit', '-d', default=None, |
||
| 314 | type=click.IntRange(0, None), |
||
| 315 | help='Maximum number of documents to use') |
||
| 316 | @click.option( |
||
| 317 | '--metrics-file', |
||
| 318 | '-M', |
||
| 319 | type=click.File( |
||
| 320 | 'w', |
||
| 321 | encoding='utf-8', |
||
| 322 | errors='ignore', |
||
| 323 | lazy=True), |
||
| 324 | help="""Specify file in order to write evaluation metrics in JSON format. |
||
| 325 | File directory must exist, existing file will be overwritten.""") |
||
| 326 | @click.option( |
||
| 327 | '--results-file', |
||
| 328 | '-r', |
||
| 329 | type=click.File( |
||
| 330 | 'w', |
||
| 331 | encoding='utf-8', |
||
| 332 | errors='ignore', |
||
| 333 | lazy=True), |
||
| 334 | help="""Specify file in order to write non-aggregated results per subject. |
||
| 335 | File directory must exist, existing file will be overwritten.""") |
||
| 336 | @click.option('--jobs', |
||
| 337 | '-j', |
||
| 338 | default=1, |
||
| 339 | help='Number of parallel jobs (0 means all CPUs)') |
||
| 340 | @backend_param_option |
||
| 341 | @common_options |
||
| 342 | def run_eval( |
||
| 343 | project_id, |
||
| 344 | paths, |
||
| 345 | limit, |
||
| 346 | threshold, |
||
| 347 | docs_limit, |
||
| 348 | metrics_file, |
||
| 349 | results_file, |
||
| 350 | jobs, |
||
| 351 | backend_param): |
||
| 352 | """ |
||
| 353 | Analyze documents and evaluate the result. |
||
| 354 | |||
| 355 | Compare the results of automated indexing against a gold standard. The |
||
| 356 | path may be either a TSV file with short documents or a directory with |
||
| 357 | documents in separate files. |
||
| 358 | """ |
||
| 359 | |||
| 360 | project = get_project(project_id) |
||
| 361 | backend_params = parse_backend_params(backend_param, project) |
||
| 362 | |||
| 363 | import annif.eval |
||
| 364 | eval_batch = annif.eval.EvaluationBatch(project.subjects) |
||
| 365 | |||
| 366 | if results_file: |
||
| 367 | try: |
||
| 368 | print('', end='', file=results_file) |
||
| 369 | click.echo('Writing per subject evaluation results to {!s}'.format( |
||
| 370 | results_file.name)) |
||
| 371 | except Exception as e: |
||
| 372 | raise NotSupportedException( |
||
| 373 | "cannot open results-file for writing: " + str(e)) |
||
| 374 | docs = open_documents(paths, docs_limit) |
||
| 375 | |||
| 376 | jobs, pool_class = annif.parallel.get_pool(jobs) |
||
| 377 | |||
| 378 | project.initialize(parallel=True) |
||
| 379 | psmap = annif.parallel.ProjectSuggestMap( |
||
| 380 | project.registry, [project_id], backend_params, limit, threshold) |
||
| 381 | |||
| 382 | with pool_class(jobs) as pool: |
||
| 383 | for hits, uris, labels in pool.imap_unordered( |
||
| 384 | psmap.suggest, docs.documents): |
||
| 385 | eval_batch.evaluate(hits[project_id], |
||
| 386 | annif.corpus.SubjectSet((uris, labels))) |
||
| 387 | |||
| 388 | template = "{0:<30}\t{1}" |
||
| 389 | metrics = eval_batch.results(results_file=results_file) |
||
| 390 | for metric, score in metrics.items(): |
||
| 391 | click.echo(template.format(metric + ":", score)) |
||
| 392 | if metrics_file: |
||
| 393 | json.dump( |
||
| 394 | {metric_code(metric): val for metric, val in metrics.items()}, |
||
| 395 | metrics_file, indent=2) |
||
| 396 | |||
| 397 | |||
| 398 | View Code Duplication | @cli.command('optimize') |
|
| 399 | @click.argument('project_id') |
||
| 400 | @click.argument('paths', type=click.Path(exists=True), nargs=-1) |
||
| 401 | @click.option('--docs-limit', '-d', default=None, |
||
| 402 | type=click.IntRange(0, None), |
||
| 403 | help='Maximum number of documents to use') |
||
| 404 | @backend_param_option |
||
| 405 | @common_options |
||
| 406 | def run_optimize(project_id, paths, docs_limit, backend_param): |
||
| 407 | """ |
||
| 408 | Analyze documents, testing multiple limits and thresholds. |
||
| 409 | |||
| 410 | Evaluate the analysis results for a directory with documents against a |
||
| 411 | gold standard given in subject files. Test different limit/threshold |
||
| 412 | values and report the precision, recall and F-measure of each combination |
||
| 413 | of settings. |
||
| 414 | """ |
||
| 415 | project = get_project(project_id) |
||
| 416 | backend_params = parse_backend_params(backend_param, project) |
||
| 417 | |||
| 418 | filter_batches = generate_filter_batches(project.subjects) |
||
| 419 | |||
| 420 | ndocs = 0 |
||
| 421 | docs = open_documents(paths, docs_limit) |
||
| 422 | for doc in docs.documents: |
||
| 423 | raw_hits = project.suggest(doc.text, backend_params) |
||
| 424 | hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT) |
||
| 425 | assert isinstance(hits, ListSuggestionResult), \ |
||
| 426 | "Optimize should only be done with ListSuggestionResult " + \ |
||
| 427 | "as it would be very slow with VectorSuggestionResult." |
||
| 428 | gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels)) |
||
| 429 | for hit_filter, batch in filter_batches.values(): |
||
| 430 | batch.evaluate(hit_filter(hits), gold_subjects) |
||
| 431 | ndocs += 1 |
||
| 432 | |||
| 433 | click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1'))) |
||
| 434 | |||
| 435 | best_scores = collections.defaultdict(float) |
||
| 436 | best_params = {} |
||
| 437 | |||
| 438 | template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}" |
||
| 439 | # Store the batches in a list that gets consumed along the way |
||
| 440 | # This way GC will have a chance to reclaim the memory |
||
| 441 | filter_batches = list(filter_batches.items()) |
||
| 442 | while filter_batches: |
||
| 443 | params, filter_batch = filter_batches.pop(0) |
||
| 444 | metrics = ['Precision (doc avg)', |
||
| 445 | 'Recall (doc avg)', |
||
| 446 | 'F1 score (doc avg)'] |
||
| 447 | results = filter_batch[1].results(metrics=metrics) |
||
| 448 | for metric, score in results.items(): |
||
| 449 | if score >= best_scores[metric]: |
||
| 450 | best_scores[metric] = score |
||
| 451 | best_params[metric] = params |
||
| 452 | click.echo( |
||
| 453 | template.format( |
||
| 454 | params[0], |
||
| 455 | params[1], |
||
| 456 | results['Precision (doc avg)'], |
||
| 457 | results['Recall (doc avg)'], |
||
| 458 | results['F1 score (doc avg)'])) |
||
| 459 | |||
| 460 | click.echo() |
||
| 461 | template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}" |
||
| 462 | for metric in metrics: |
||
| 463 | click.echo( |
||
| 464 | template2.format( |
||
| 465 | metric, |
||
| 466 | best_scores[metric], |
||
| 467 | best_params[metric][0], |
||
| 468 | best_params[metric][1])) |
||
| 469 | click.echo("Documents evaluated:\t{}".format(ndocs)) |
||
| 470 | |||
| 471 | |||
| 472 | View Code Duplication | @cli.command('hyperopt') |
|
| 473 | @click.argument('project_id') |
||
| 474 | @click.argument('paths', type=click.Path(exists=True), nargs=-1) |
||
| 475 | @click.option('--docs-limit', '-d', default=None, |
||
| 476 | type=click.IntRange(0, None), |
||
| 477 | help='Maximum number of documents to use') |
||
| 478 | @click.option('--trials', '-T', default=10, help='Number of trials') |
||
| 479 | @click.option('--jobs', |
||
| 480 | '-j', |
||
| 481 | default=1, |
||
| 482 | help='Number of parallel runs (0 means all CPUs)') |
||
| 483 | @click.option('--metric', '-m', default='NDCG', |
||
| 484 | help='Metric to optimize (default: NDCG)') |
||
| 485 | @click.option( |
||
| 486 | '--results-file', |
||
| 487 | '-r', |
||
| 488 | type=click.File( |
||
| 489 | 'w', |
||
| 490 | encoding='utf-8', |
||
| 491 | errors='ignore', |
||
| 492 | lazy=True), |
||
| 493 | help="""Specify file path to write trial results as CSV. |
||
| 494 | File directory must exist, existing file will be overwritten.""") |
||
| 495 | @common_options |
||
| 496 | def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, |
||
| 497 | results_file): |
||
| 498 | """ |
||
| 499 | Optimize the hyperparameters of a project using a validation corpus. |
||
| 500 | """ |
||
| 501 | proj = get_project(project_id) |
||
| 502 | documents = open_documents(paths, docs_limit) |
||
| 503 | click.echo(f"Looking for optimal hyperparameters using {trials} trials") |
||
| 504 | rec = proj.hyperopt(documents, trials, jobs, metric, results_file) |
||
| 505 | click.echo(f"Got best {metric} score {rec.score:.4f} with:") |
||
| 506 | click.echo("---") |
||
| 507 | for line in rec.lines: |
||
| 508 | click.echo(line) |
||
| 509 | click.echo("---") |
||
| 510 | |||
| 511 | |||
| 512 | if __name__ == '__main__': |
||
| 513 | cli() |
||
| 514 |
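Looking at the listing, the clearest duplication is the suggestion-line formatting repeated in `run_suggest` and `run_index`. One possible way to remove it, assuming the hit objects keep their `uri`, `label`, `notation` and `score` attributes, is a small shared helper; this is only an illustrative sketch, not a change made in the Annif codebase.

```python
def format_suggestion_line(hit):
    """Format one suggestion hit as a TSV line: '<uri>\\tlabel[\\tnotation]\\tscore'."""
    return "<{}>\t{}\t{}".format(
        hit.uri,
        '\t'.join(filter(None, (hit.label, hit.notation))),
        hit.score)


# run_suggest could then call:  click.echo(format_suggestion_line(hit))
# run_index could then call:    click.echo(format_suggestion_line(hit), file=subjfile)
```

Extracting the helper keeps the output format defined in exactly one place, so the two commands cannot drift apart, which is the point of the rule of three discussed at the top of this report.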