annif.cli - Code Metrics - Inspection of "Output eval metrics as a JSON file compatible with..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#557)

by Osma

created 2022-01-27 12:50 UTC

annif.cli C

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	511
Duplicated Lines	72.21 %

Importance

Changes

Metric	Value
eloc	380
dl	369
loc	511
rs	6.4799
c	0
b	0
f	0
wmc	54

19 Functions

Rating	Name	Duplication	Size	Complexity
A	backend_param_option()	0	6	1
A	set_project_config_file_path()	0	5	3
A	run_clear_project()	0	9	1
A	common_options()	0	8	1
A	run_show_project()	0	15	1
A	get_project()	0	11	2
A	validate_backend_params()	0	6	2
A	run_loadvoc()	16	16	2
A	open_documents()	23	23	5
A	run_train()	28	28	3
A	generate_filter_batches()	0	9	3
A	run_suggest()	21	21	2
A	run_learn()	16	16	1
A	parse_backend_params()	10	10	2
A	run_list_projects()	17	17	2
B	run_index()	42	42	7
A	run_hyperopt()	38	38	2
B	run_optimize()	72	72	7
C	run_eval()	86	86	7

How to fix Duplicated Code Complexity

"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""


import collections
import os.path
import re
import sys
import json
import click
import click_log
from flask import current_app
from flask.cli import FlaskGroup, ScriptInfo
import annif
import annif.corpus
import annif.parallel
import annif.project
import annif.registry
from annif.project import Access
from annif.suggestion import SuggestionFilter, ListSuggestionResult
from annif.exception import ConfigurationException, NotSupportedException

# Use the package-level Annif logger for all CLI output and let click_log
# configure it (presumably attaching its Click-aware handler/formatter).
logger = annif.logger
click_log.basic_config(logger)

# Build the command group on top of Flask's CLI machinery. Flask's built-in
# version option is switched off and replaced by one whose message consists
# of the version string only.
cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
cli = click.version_option(message='%(version)s')(cli)


def get_project(project_id):
    """Look up a project by its ID.

    Projects are requested with private access as the minimum level. If
    the registry raises ValueError for the given ID, an error message is
    written to stderr and the process exits with status 1."""
    try:
        project = annif.registry.get_project(
            project_id, min_access=Access.private)
    except ValueError:
        message = "No projects found with id \'{0}\'.".format(project_id)
        click.echo(message, err=True)
        sys.exit(1)
    else:
        return project


def open_documents(paths, docs_limit):
    """Open a document corpus from a sequence of pathnames.

    Every path is either a TSV file or a directory of TXT files. With no
    paths an empty corpus (backed by the null device) is used and a warning
    is logged; with several paths the corpora are combined. When docs_limit
    is not None the result is wrapped in a LimitingDocumentCorpus."""

    def load_corpus(path):
        """Return the corpus for one path (directory or single file)."""
        if not os.path.isdir(path):
            return annif.corpus.DocumentFile(path)
        return annif.corpus.DocumentDirectory(path, require_subjects=True)

    path_count = len(paths)
    if path_count > 1:
        corpus = annif.corpus.CombinedCorpus(
            [load_corpus(path) for path in paths])
    elif path_count == 1:
        corpus = load_corpus(paths[0])
    else:
        logger.warning('Reading empty file')
        corpus = load_corpus(os.path.devnull)

    if docs_limit is None:
        return corpus
    return annif.corpus.LimitingDocumentCorpus(corpus, docs_limit)


def parse_backend_params(backend_param, project):
    """Parse a list of backend parameters given with the --backend-param
    option into a nested dict structure.

    Each item must have the form "<backend>.<parameter>=<value>". The
    result maps backend -> {parameter: value}; values are kept as the
    strings given on the command line.

    Raises ConfigurationException if an item does not follow the expected
    syntax, or (via validate_backend_params) if it names a backend other
    than the one configured for the project."""
    backend_params = collections.defaultdict(dict)
    for beparam in backend_param:
        # partition() never raises, so malformed input can be reported with
        # a readable message instead of a tuple-unpacking ValueError.
        backend, dot, param = beparam.partition('.')
        key, equals, val = param.partition('=')
        if not (dot and equals):
            raise ConfigurationException(
                'Invalid syntax in CLI option "-b {}"; expected '
                '"<backend>.<parameter>=<value>".'.format(beparam))
        validate_backend_params(backend, beparam, project)
        backend_params[backend][key] = val
    return backend_params


def validate_backend_params(backend, beparam, project):
    """Check that a backend named in a "-b" option matches the project.

    backend is the backend name extracted from the option, beparam the
    full option value (used only in the error message). Raises
    ConfigurationException when backend differs from the project's
    configured 'backend' setting; otherwise returns None."""
    project_backend = project.config['backend']
    if backend == project_backend:
        return
    raise ConfigurationException(
        'The backend {} in CLI option "-b {}" not matching the project'
        ' backend {}.'
        .format(backend, beparam, project_backend))


# Largest "limit" value for which a filter/evaluation batch is generated.
BATCH_MAX_LIMIT = 15


def generate_filter_batches(subjects):
    """Create one (SuggestionFilter, EvaluationBatch) pair for every
    combination of limit (1..BATCH_MAX_LIMIT) and threshold (0.05 steps,
    twenty values starting from 0).

    Returns an ordered mapping keyed by (limit, threshold) tuples."""
    import annif.eval
    thresholds = [step * 0.05 for step in range(20)]
    batches = collections.OrderedDict()
    for max_hits in range(1, BATCH_MAX_LIMIT + 1):
        for min_score in thresholds:
            suggestion_filter = SuggestionFilter(
                subjects, max_hits, min_score)
            evaluation = annif.eval.EvaluationBatch(subjects)
            batches[(max_hits, min_score)] = (suggestion_filter, evaluation)
    return batches


def set_project_config_file_path(ctx, param, value):
    """Click option callback: store the projects file path given on the
    command line in the application config (key 'PROJECTS_FILE').

    The application is loaded and its context entered even when no value
    was given; the config is only touched for a truthy value."""
    app = ctx.ensure_object(ScriptInfo).load_app()
    with app.app_context():
        if not value:
            return
        current_app.config['PROJECTS_FILE'] = value


def common_options(f):
    """Decorate a CLI command with the options shared by all commands:
    the eager -p/--projects option and click_log's verbosity option."""
    projects_option = click.option(
        '-p', '--projects',
        help='Set path to projects.cfg',
        type=click.Path(dir_okay=False, exists=True),
        callback=set_project_config_file_path,
        expose_value=False,
        is_eager=True)
    verbosity_option = click_log.simple_verbosity_option(logger)
    return verbosity_option(projects_option(f))


def backend_param_option(f):
    """Decorate a CLI command with the repeatable -b/--backend-param
    option used to override backend parameters."""
    help_text = ('Override backend parameter of the config file. '
                 'Syntax: "-b <backend>.<parameter>=<value>".')
    add_option = click.option(
        '--backend-param', '-b', multiple=True, help=help_text)
    return add_option(f)


@cli.command('list-projects')
@common_options
@click_log.simple_verbosity_option(logger, default='ERROR')
def run_list_projects():
    """
    List available projects.
    """

    # Fixed-width, left-aligned columns: ID, name, language, trained flag.
    row_format = "{0: <25}{1: <45}{2: <10}{3: <7}"
    heading = row_format.format(
        "Project ID", "Project Name", "Language", "Trained")
    click.echo(heading)
    click.echo("-" * len(heading))

    # Include every project down to private access level.
    projects = annif.registry.get_projects(min_access=Access.private)
    for project in projects.values():
        row = row_format.format(
            project.project_id,
            project.name,
            project.language,
            str(project.is_trained))
        click.echo(row)


@cli.command('show-project')
@click.argument('project_id')
@common_options
def run_show_project(project_id):
    """
    Show information about a project.
    """

    # One "label: value" line per attribute, labels padded to equal width.
    project = get_project(project_id)
    echo = click.echo
    echo(f'Project ID:        {project.project_id}')
    echo(f'Project Name:      {project.name}')
    echo(f'Language:          {project.language}')
    echo(f'Access:            {project.access.name}')
    echo(f'Trained:           {project.is_trained}')
    echo(f'Modification time: {project.modification_time}')


@cli.command('clear')
@click.argument('project_id')
@common_options
def run_clear_project(project_id):
    """
    Initialize the project to its original, untrained state.
    """
    # Resolve the project (exits if unknown) and drop its model data.
    get_project(project_id).remove_model_data()


@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
    """
    Load a vocabulary for a project.
    """
    project = get_project(project_id)
    is_rdf = annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile)
    if not is_rdf:
        # anything that is not recognized as RDF is treated as a TSV file
        vocabulary = annif.corpus.SubjectFileTSV(subjectfile)
    else:
        # SKOS/RDF file supported by rdflib
        vocabulary = annif.corpus.SubjectFileSKOS(
            subjectfile, project.language)
    project.vocab.load_vocabulary(vocabulary, project.language)


@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option(
    '--cached/--no-cached', '-c/-C',
    default=False,
    help='Reuse preprocessed training data from previous run')
@click.option(
    '--docs-limit', '-d',
    default=None,
    type=click.IntRange(0, None),
    help='Maximum number of documents to use')
@click.option(
    '--jobs', '-j',
    default=0,
    help='Number of parallel jobs (0 means choose automatically)')
@backend_param_option
@common_options
def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
    """
    Train a project on a collection of documents.
    """
    project = get_project(project_id)
    overrides = parse_backend_params(backend_param, project)

    # --cached and explicit corpus paths are mutually exclusive.
    if cached and len(paths) > 0:
        raise click.UsageError(
            "Corpus paths cannot be given when using --cached option.")

    # The literal string 'cached' is passed in place of a corpus when the
    # previously preprocessed training data should be reused.
    if cached:
        training_data = 'cached'
    else:
        training_data = open_documents(paths, docs_limit)
    project.train(training_data, overrides, jobs)


@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option(
    '--docs-limit', '-d',
    default=None,
    type=click.IntRange(0, None),
    help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_learn(project_id, paths, docs_limit, backend_param):
    """
    Further train an existing project on a collection of documents.
    """
    project = get_project(project_id)
    # Parameters are parsed (and validated) before the corpus is opened.
    overrides = parse_backend_params(backend_param, project)
    corpus = open_documents(paths, docs_limit)
    project.learn(corpus, overrides)


@cli.command('suggest')
@click.argument('project_id')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    document_text = sys.stdin.read()
    overrides = parse_backend_params(backend_param, project)
    apply_filter = SuggestionFilter(project.subjects, limit, threshold)
    suggestions = apply_filter(project.suggest(document_text, overrides))

    # One tab-separated line per hit: <uri>, label and/or notation, score.
    for hit in suggestions.as_list(project.subjects):
        uri = hit.uri
        # label and notation are both optional; empty ones are left out
        names = '\t'.join(filter(None, (hit.label, hit.notation)))
        click.echo("<{}>\t{}\t{}".format(uri, names, hit.score))


@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
    '--suffix',
    '-s',
    default='.annif',
    help='File name suffix for result files')
@click.option('--force/--no-force', '-f/-F', default=False,
              help='Force overwriting of existing result files')
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@backend_param_option
@common_options
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    # Parameters (all supplied by Click):
    #   project_id    - ID of the project used for suggesting subjects
    #   directory     - existing directory containing the documents
    #   suffix        - replaces the trailing ".txt" of each document name
    #   force         - overwrite result files that already exist
    #   limit         - maximum number of subjects per document
    #   threshold     - minimum score for a subject to be included
    #   backend_param - "-b" overrides, parsed by parse_backend_params
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        # A callable replacement makes re.sub insert the suffix literally;
        # a plain string would have its backslash escapes interpreted.
        subjectfilename = re.sub(
            r'\.txt$', lambda _match: suffix, docfilename)
        # Decide whether to skip before doing any I/O on the document.
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue
        # utf-8-sig transparently drops a leading byte order mark
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()
        # Compute everything before opening the result file for writing, so
        # that a failure here cannot leave an empty or truncated file behind.
        results = project.suggest(text, backend_params)
        hits = hit_filter(results).as_list(project.subjects)
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hits:
                line = "<{}>\t{}\t{}".format(
                    hit.uri,
                    '\t'.join(filter(None, (hit.label, hit.notation))),
                    hit.score)
                click.echo(line, file=subjfile)


@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option(
    '--metrics-file',
    '-M',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""")
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""")
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel jobs (0 means all CPUs)')
@backend_param_option
@common_options
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        docs_limit,
        metrics_file,
        results_file,
        jobs,
        backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    # Raises NotSupportedException if the results file cannot be written.

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    import annif.eval
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # The file option is declared lazy, so write an empty string now to
        # surface an unwritable path before the (long) evaluation starts.
        try:
            print('', end='', file=results_file)
        except Exception as e:
            # keep the original error attached as the explicit cause
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)) from e
        click.echo('Writing per subject evaluation results to {!s}'.format(
            results_file.name))
    docs = open_documents(paths, docs_limit)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    # Documents are processed by the pool in arbitrary order; every result
    # is fed into the single evaluation batch.
    with pool_class(jobs) as pool:
        for hits, uris, labels in pool.imap_unordered(
                psmap.suggest, docs.documents):
            eval_batch.evaluate(hits[project_id],
                                annif.corpus.SubjectSet((uris, labels)))

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(results_file=results_file)
    for metric, score in metrics.items():
        click.echo(template.format(metric + ":", score))
    if metrics_file:
        json.dump(metrics, metrics_file, indent=2)


@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, docs_limit)
    for doc in docs.documents:
        raw_hits = project.suggest(doc.text, backend_params)
        # Pre-filter once with the largest limit any batch will use.
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), \
            "Optimize should only be done with ListSuggestionResult " + \
            "as it would be very slow with VectorSuggestionResult."
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    # The reported metrics are the same for every batch, so define them once
    # here; the summary below no longer relies on a name bound in the loop.
    metrics = ['Precision (doc avg)',
               'Recall (doc avg)',
               'F1 score (doc avg)']
    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a queue that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        params, filter_batch = filter_batches.popleft()
        results = filter_batch[1].results(metrics=metrics)
        for metric, score in results.items():
            # ">=" means that on ties the later (limit, threshold) pair wins
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))


@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option(
    '--docs-limit', '-d',
    default=None,
    type=click.IntRange(0, None),
    help='Maximum number of documents to use')
@click.option('--trials', '-T', default=10, help='Number of trials')
@click.option(
    '--jobs', '-j',
    default=1,
    help='Number of parallel runs (0 means all CPUs)')
@click.option(
    '--metric', '-m',
    default='NDCG',
    help='Metric to optimize (default: NDCG)')
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""")
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric,
                 results_file):
    """
    Optimize the hyperparameters of a project using a validation corpus.
    """
    project = get_project(project_id)
    corpus = open_documents(paths, docs_limit)
    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    recommendation = project.hyperopt(
        corpus, trials, jobs, metric, results_file)
    click.echo(f"Got best {metric} score {recommendation.score:.4f} with:")

    # Print the recommended settings framed by separator lines.
    separator = "---"
    click.echo(separator)
    for recommended_line in recommendation.lines:
        click.echo(recommended_line)
    click.echo(separator)


# Invoke the command group when this module is executed as a script.
if __name__ == '__main__':
    cli()


1		"""Definitions for command-line (Click) commands for invoking Annif
2		operations and printing the results to console."""
3
4
5		import collections
6		import os.path
7		import re
8		import sys
9		import json
10		import click
11		import click_log
12		from flask import current_app
13		from flask.cli import FlaskGroup, ScriptInfo
14		import annif
15		import annif.corpus
16		import annif.parallel
17		import annif.project
18		import annif.registry
19		from annif.project import Access
20		from annif.suggestion import SuggestionFilter, ListSuggestionResult
21		from annif.exception import ConfigurationException, NotSupportedException
22
23		logger = annif.logger
24		click_log.basic_config(logger)
25
26		cli = FlaskGroup(create_app=annif.create_app, add_version_option=False)
27		cli = click.version_option(message='%(version)s')(cli)
28
29
30		def get_project(project_id):
31		"""
32		Helper function to get a project by ID and bail out if it doesn't exist"""
33		try:
34		return annif.registry.get_project(project_id,
35		min_access=Access.private)
36		except ValueError:
37		click.echo(
38		"No projects found with id \'{0}\'.".format(project_id),
39		err=True)
40		sys.exit(1)
41
42
43	View Code Duplication	def open_documents(paths, docs_limit):
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
44		"""Helper function to open a document corpus from a list of pathnames,
45		each of which is either a TSV file or a directory of TXT files. The
46		corpus will be returned as an instance of DocumentCorpus or
47		LimitingDocumentCorpus."""
48
49		def open_doc_path(path):
50		"""open a single path and return it as a DocumentCorpus"""
51		if os.path.isdir(path):
52		return annif.corpus.DocumentDirectory(path, require_subjects=True)
53		return annif.corpus.DocumentFile(path)
54
55		if len(paths) == 0:
56		logger.warning('Reading empty file')
57		docs = open_doc_path(os.path.devnull)
58		elif len(paths) == 1:
59		docs = open_doc_path(paths[0])
60		else:
61		corpora = [open_doc_path(path) for path in paths]
62		docs = annif.corpus.CombinedCorpus(corpora)
63		if docs_limit is not None:
64		docs = annif.corpus.LimitingDocumentCorpus(docs, docs_limit)
65		return docs
66
67
68	View Code Duplication	def parse_backend_params(backend_param, project):
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
69		"""Parse a list of backend parameters given with the --backend-param
70		option into a nested dict structure"""
71		backend_params = collections.defaultdict(dict)
72		for beparam in backend_param:
73		backend, param = beparam.split('.', 1)
74		key, val = param.split('=', 1)
75		validate_backend_params(backend, beparam, project)
76		backend_params[backend][key] = val
77		return backend_params
78
79
80		def validate_backend_params(backend, beparam, project):
81		if backend != project.config['backend']:
82		raise ConfigurationException(
83		'The backend {} in CLI option "-b {}" not matching the project'
84		' backend {}.'
85		.format(backend, beparam, project.config['backend']))
86
87
88		BATCH_MAX_LIMIT = 15
89
90
91		def generate_filter_batches(subjects):
92		import annif.eval
93		filter_batches = collections.OrderedDict()
94		for limit in range(1, BATCH_MAX_LIMIT + 1):
95		for threshold in [i * 0.05 for i in range(20)]:
96		hit_filter = SuggestionFilter(subjects, limit, threshold)
97		batch = annif.eval.EvaluationBatch(subjects)
98		filter_batches[(limit, threshold)] = (hit_filter, batch)
99		return filter_batches
100
101
102		def set_project_config_file_path(ctx, param, value):
103		"""Override the default path or the path given in env by CLI option"""
104		with ctx.ensure_object(ScriptInfo).load_app().app_context():
105		if value:
106		current_app.config['PROJECTS_FILE'] = value
107
108
109		def common_options(f):
110		"""Decorator to add common options for all CLI commands"""
111		f = click.option(
112		'-p', '--projects', help='Set path to projects.cfg',
113		type=click.Path(dir_okay=False, exists=True),
114		callback=set_project_config_file_path, expose_value=False,
115		is_eager=True)(f)
116		return click_log.simple_verbosity_option(logger)(f)
117
118
119		def backend_param_option(f):
120		"""Decorator to add an option for CLI commands to override BE parameters"""
121		return click.option(
122		'--backend-param', '-b', multiple=True,
123		help='Override backend parameter of the config file. ' +
124		'Syntax: "-b <backend>.<parameter>=<value>".')(f)
125
126
127	View Code Duplication	@cli.command('list-projects')
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
128		@common_options
129		@click_log.simple_verbosity_option(logger, default='ERROR')
130		def run_list_projects():
131		"""
132		List available projects.
133		"""
134
135		template = "{0: <25}{1: <45}{2: <10}{3: <7}"
136		header = template.format(
137		"Project ID", "Project Name", "Language", "Trained")
138		click.echo(header)
139		click.echo("-" * len(header))
140		for proj in annif.registry.get_projects(
141		min_access=Access.private).values():
142		click.echo(template.format(
143		proj.project_id, proj.name, proj.language, str(proj.is_trained)))
144
145
146		@cli.command('show-project')
147		@click.argument('project_id')
148		@common_options
149		def run_show_project(project_id):
150		"""
151		Show information about a project.
152		"""
153
154		proj = get_project(project_id)
155		click.echo(f'Project ID: {proj.project_id}')
156		click.echo(f'Project Name: {proj.name}')
157		click.echo(f'Language: {proj.language}')
158		click.echo(f'Access: {proj.access.name}')
159		click.echo(f'Trained: {proj.is_trained}')
160		click.echo(f'Modification time: {proj.modification_time}')
161
162
163		@cli.command('clear')
164		@click.argument('project_id')
165		@common_options
166		def run_clear_project(project_id):
167		"""
168		Initialize the project to its original, untrained state.
169		"""
170		proj = get_project(project_id)
171		proj.remove_model_data()
172
173
174	View Code Duplication	@cli.command('loadvoc')
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
175		@click.argument('project_id')
176		@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
177		@common_options
178		def run_loadvoc(project_id, subjectfile):
179		"""
180		Load a vocabulary for a project.
181		"""
182		proj = get_project(project_id)
183		if annif.corpus.SubjectFileSKOS.is_rdf_file(subjectfile):
184		# SKOS/RDF file supported by rdflib
185		subjects = annif.corpus.SubjectFileSKOS(subjectfile, proj.language)
186		else:
187		# probably a TSV file
188		subjects = annif.corpus.SubjectFileTSV(subjectfile)
189		proj.vocab.load_vocabulary(subjects, proj.language)
190
191
192	View Code Duplication	@cli.command('train')
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
193		@click.argument('project_id')
194		@click.argument('paths', type=click.Path(exists=True), nargs=-1)
195		@click.option('--cached/--no-cached', '-c/-C', default=False,
196		help='Reuse preprocessed training data from previous run')
197		@click.option('--docs-limit', '-d', default=None,
198		type=click.IntRange(0, None),
199		help='Maximum number of documents to use')
200		@click.option('--jobs',
201		'-j',
202		default=0,
203		help='Number of parallel jobs (0 means choose automatically)')
204		@backend_param_option
205		@common_options
206		def run_train(project_id, paths, cached, docs_limit, jobs, backend_param):
207		"""
208		Train a project on a collection of documents.
209		"""
210		proj = get_project(project_id)
211		backend_params = parse_backend_params(backend_param, proj)
212		if cached:
213		if len(paths) > 0:
214		raise click.UsageError(
215		"Corpus paths cannot be given when using --cached option.")
216		documents = 'cached'
217		else:
218		documents = open_documents(paths, docs_limit)
219		proj.train(documents, backend_params, jobs)
220
221
222	View Code Duplication	@cli.command('learn')
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
223		@click.argument('project_id')
224		@click.argument('paths', type=click.Path(exists=True), nargs=-1)
225		@click.option('--docs-limit', '-d', default=None,
226		type=click.IntRange(0, None),
227		help='Maximum number of documents to use')
228		@backend_param_option
229		@common_options
230		def run_learn(project_id, paths, docs_limit, backend_param):
231		"""
232		Further train an existing project on a collection of documents.
233		"""
234		proj = get_project(project_id)
235		backend_params = parse_backend_params(backend_param, proj)
236		documents = open_documents(paths, docs_limit)
237		proj.learn(documents, backend_params)
238
239
240	View Code Duplication	@cli.command('suggest')
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
241		@click.argument('project_id')
242		@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
243		@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
244		@backend_param_option
245		@common_options
246		def run_suggest(project_id, limit, threshold, backend_param):
247		"""
248		Suggest subjects for a single document from standard input.
249		"""
250		project = get_project(project_id)
251		text = sys.stdin.read()
252		backend_params = parse_backend_params(backend_param, project)
253		hit_filter = SuggestionFilter(project.subjects, limit, threshold)
254		hits = hit_filter(project.suggest(text, backend_params))
255		for hit in hits.as_list(project.subjects):
256		click.echo(
257		"<{}>\t{}\t{}".format(
258		hit.uri,
259		'\t'.join(filter(None, (hit.label, hit.notation))),
260		hit.score))
261
262
263	View Code Duplication	@cli.command('index')
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
264		@click.argument('project_id')
265		@click.argument('directory', type=click.Path(exists=True, file_okay=False))
266		@click.option(
267		'--suffix',
268		'-s',
269		default='.annif',
270		help='File name suffix for result files')
271		@click.option('--force/--no-force', '-f/-F', default=False,
272		help='Force overwriting of existing result files')
273		@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
274		@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
275		@backend_param_option
276		@common_options
277		def run_index(project_id, directory, suffix, force,
278		limit, threshold, backend_param):
279		"""
280		Index a directory with documents, suggesting subjects for each document.
281		Write the results in TSV files with the given suffix.
282		"""
283		project = get_project(project_id)
284		backend_params = parse_backend_params(backend_param, project)
285		hit_filter = SuggestionFilter(project.subjects, limit, threshold)
286
287		for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
288		directory, require_subjects=False):
289		with open(docfilename, encoding='utf-8-sig') as docfile:
290		text = docfile.read()
291		subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
292		if os.path.exists(subjectfilename) and not force:
293		click.echo(
294		"Not overwriting {} (use --force to override)".format(
295		subjectfilename))
296		continue
297		with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
298		results = project.suggest(text, backend_params)
299		for hit in hit_filter(results).as_list(project.subjects):
300		line = "<{}>\t{}\t{}".format(
301		hit.uri,
302		'\t'.join(filter(None, (hit.label, hit.notation))),
303		hit.score)
304		click.echo(line, file=subjfile)
305
306
@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', '-l', default=10, help='Maximum number of subjects')
@click.option('--threshold', '-t', default=0.0, help='Minimum score threshold')
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@click.option(
    '--metrics-file',
    '-M',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write evaluation metrics in JSON format.
    File directory must exist, existing file will be overwritten.""")
@click.option(
    '--results-file',
    '-r',
    type=click.File(
        'w',
        encoding='utf-8',
        errors='ignore',
        lazy=True),
    help="""Specify file in order to write non-aggregated results per subject.
    File directory must exist, existing file will be overwritten.""")
@click.option('--jobs',
              '-j',
              default=1,
              help='Number of parallel jobs (0 means all CPUs)')
@backend_param_option
@common_options
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        docs_limit,
        metrics_file,
        results_file,
        jobs,
        backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    # NOTE: the docstring above doubles as the Click --help text, so
    # maintainer-facing notes live in comments instead.
    #
    # Flow: resolve the project, probe the optional results file, suggest
    # subjects for every document through a worker pool, feed each result
    # into an EvaluationBatch, then print the aggregated metrics and
    # optionally dump them as JSON.

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    # Imported here rather than at module level (kept as in the original).
    import annif.eval
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        # The option is declared with lazy=True; writing an empty string is
        # presumably meant to force the file open now, so that an unwritable
        # path is reported before the evaluation runs.
        try:
            print('', end='', file=results_file)
        except Exception as e:
            # Chain the cause so the underlying error stays in the traceback.
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)) from e
        # Kept outside the try block: a failure to echo is not a failure to
        # open the results file and must not be reported as one.
        click.echo('Writing per subject evaluation results to {!s}'.format(
            results_file.name))
    docs = open_documents(paths, docs_limit)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize(parallel=True)
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    with pool_class(jobs) as pool:
        # imap_unordered: results are consumed in whatever order the pool
        # yields them.
        for hits, uris, labels in pool.imap_unordered(
                psmap.suggest, docs.documents):
            eval_batch.evaluate(hits[project_id],
                                annif.corpus.SubjectSet((uris, labels)))

    template = "{0:<30}\t{1}"
    metrics = eval_batch.results(results_file=results_file)
    for metric, score in metrics.items():
        click.echo(template.format(metric + ":", score))
    if metrics_file:
        json.dump(metrics, metrics_file, indent=2)
393
394
@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--docs-limit', '-d', default=None,
              type=click.IntRange(0, None),
              help='Maximum number of documents to use')
@backend_param_option
@common_options
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    # NOTE: the docstring above doubles as the Click --help text, so
    # maintainer-facing notes live in comments instead.
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    # Values are unpacked below as (hit_filter, batch) pairs; the keys are
    # later printed as (limit, threshold).
    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, docs_limit)
    for doc in docs.documents:
        # Suggest once per document, then re-filter the same hits with every
        # limit/threshold combination.
        raw_hits = project.suggest(doc.text, backend_params)
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), \
            "Optimize should only be done with ListSuggestionResult " + \
            "as it would be very slow with VectorSuggestionResult."
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    # The metric names never change between batches, so define them once
    # here; the summary loop below no longer depends on a variable that was
    # only bound inside the while loop.
    metrics = ['Precision (doc avg)',
               'Recall (doc avg)',
               'F1 score (doc avg)']

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a queue that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    # (a deque makes each removal from the front O(1), unlike list.pop(0)).
    filter_batches = collections.deque(filter_batches.items())
    while filter_batches:
        params, filter_batch = filter_batches.popleft()
        # filter_batch is the (hit_filter, batch) pair; index 1 is the batch.
        results = filter_batch[1].results(metrics=metrics)
        for metric, score in results.items():
            # ">=" means a later combination wins ties.
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0],
                params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
466		click.echo("Documents evaluated:\t{}".format(ndocs))
467
468
@cli.command('hyperopt')
@click.argument('project_id')
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option(
    '--docs-limit', '-d',
    default=None,
    type=click.IntRange(0, None),
    help='Maximum number of documents to use')
@click.option('--trials', '-T', default=10, help='Number of trials')
@click.option(
    '--jobs', '-j',
    default=1,
    help='Number of parallel runs (0 means all CPUs)')
@click.option(
    '--metric', '-m',
    default='NDCG',
    help='Metric to optimize (default: NDCG)')
@click.option(
    '--results-file', '-r',
    type=click.File('w', encoding='utf-8', errors='ignore', lazy=True),
    help="""Specify file path to write trial results as CSV.
    File directory must exist, existing file will be overwritten.""")
@common_options
def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric,
                 results_file):
    """
    Optimize the hyperparameters of a project using a validation corpus.
    """
    # The docstring above is also the Click --help text; keep it unchanged.
    project = get_project(project_id)
    corpus = open_documents(paths, docs_limit)

    click.echo(f"Looking for optimal hyperparameters using {trials} trials")
    rec = project.hyperopt(corpus, trials, jobs, metric, results_file)

    # Report the best score, then the recommendation lines between
    # separator rows.
    separator = "---"
    click.echo(f"Got best {metric} score {rec.score:.4f} with:")
    click.echo(separator)
    for recommendation in rec.lines:
        click.echo(recommendation)
    click.echo(separator)
507
508
# Script entry point: run the Click command group when executed directly.
if __name__ == "__main__":
    cli()
511

NatLibFi / Annif

Pull Request — master (#557)

annif.cli C

Complexity

Size/Duplication

Importance

19 Functions

How to fix Duplicated Code Complexity

Duplicated Code

Complexity

Duplication Side-by-Side

Filter issues like