annif.parallel.ProjectSuggestMap.suggest() - Code Metrics - Inspection of "Batch processing in training of NN ensemble - base..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#676)

by Juho

created 2023-03-08 12:23 UTC

annif.parallel.ProjectSuggestMap.suggest() A

↳ Parent: annif.parallel.ProjectSuggestMap.suggest_batch()

Complexity

Conditions

Size

Total Lines	9
Code Lines	8

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	8
nop	2
dl	0
loc	9
rs	10
c	0
b	0
f	0

"""Parallel processing functionality for Annif"""


import multiprocessing
import multiprocessing.dummy
from collections import defaultdict

# Start method for processes created by the multiprocessing module.
# A value of None means using the platform-specific default.
# Intended to be overridden in unit tests.
MP_START_METHOD = None


class BaseWorker:
    """Common base for worker classes whose tasks run via multiprocessing.

    Data objects that a task needs are handed to ``init``, which keeps them
    in the class attribute ``args`` so that the static worker method can
    reach them. The storage solution is inspired by this blog post:
    https://thelaziestprogrammer.com/python/multiprocessing-pool-a-global-solution # noqa
    """

    # Payload shared with the worker method; remains None until init() runs.
    args = None

    @classmethod
    def init(cls, args):
        """Remember *args* on the class itself (not on an instance), so a
        subclass that calls this gets its own ``args`` value."""
        cls.args = args  # pragma: no cover


class ProjectSuggestMap:
    """A utility class that can be used to wrap one or more projects and
    provide a mapping method that converts Document objects to suggestions.
    Intended to be used with the multiprocessing module."""

    def __init__(self, registry, project_ids, backend_params, limit, threshold):
        """Store the settings used by suggest_batch.

        registry -- object whose get_project(project_id) returns a project
        project_ids -- iterable of project identifiers to query, in order
        backend_params -- passed unchanged to each project's suggest()
        limit, threshold -- passed unchanged to each hit set's filter()
        """
        self.registry = registry
        self.project_ids = project_ids
        self.backend_params = backend_params
        self.limit = limit
        self.threshold = threshold

    def suggest_batch(self, batch):
        """Run every wrapped project on a batch of documents.

        Each item of batch must provide ``text`` and ``subject_set``
        attributes; batch is iterated exactly once, so it may be a
        generator.

        Returns a tuple (filtered_hit_sets, subject_sets):
        filtered_hit_sets is a defaultdict mapping each project id to the
        list of filtered hit sets (one per document, in batch order), and
        subject_sets is a tuple with the subject_set of each document.

        An empty batch returns an empty mapping and an empty tuple without
        querying any project.
        """
        filtered_hit_sets = defaultdict(list)
        pairs = [(doc.text, doc.subject_set) for doc in batch]

        if not pairs:
            # zip(*[]) produces nothing, so the two-way unpacking below
            # would fail with ValueError; there is nothing to suggest for.
            return (filtered_hit_sets, ())

        texts, subject_sets = zip(*pairs)

        for project_id in self.project_ids:
            project = self.registry.get_project(project_id)
            hit_sets = project.suggest(texts, self.backend_params)
            for hits in hit_sets:
                filtered_hit_sets[project_id].append(
                    hits.filter(project.subjects, self.limit, self.threshold)
                )
        return (filtered_hit_sets, subject_sets)


def get_pool(n_jobs):
    """Choose a multiprocessing pool class for the requested number of
    parallel jobs.

    Returns a tuple (jobs, pool_class) where jobs is the value to pass to
    the pool constructor: None when n_jobs is below one, otherwise n_jobs
    itself.
    """

    # Resolve the context first so the configured start method is always used
    context = multiprocessing.get_context(MP_START_METHOD)

    if n_jobs < 1:
        # no explicit job count: hand None to a process pool
        return None, context.Pool

    if n_jobs == 1:
        # a single job runs in the thread-backed dummy pool, which avoids
        # the overhead of starting a subprocess
        return n_jobs, multiprocessing.dummy.Pool

    return n_jobs, context.Pool


1			"""Parallel processing functionality for Annif"""
2
3
4			import multiprocessing
5			import multiprocessing.dummy
6			from collections import defaultdict
7
8			# Start method for processes created by the multiprocessing module.
9			# A value of None means using the platform-specific default.
10			# Intended to be overridden in unit tests.
11			MP_START_METHOD = None
12
13
14			class BaseWorker:
15			"""Base class for workers that implement tasks executed via
16			multiprocessing. The init method can be used to store data objects that
17			are necessary for the operation. They will be stored in a class
18			attribute that is accessible to the static worker method. The storage
19			solution is inspired by this blog post:
20			https://thelaziestprogrammer.com/python/multiprocessing-pool-a-global-solution # noqa
21			"""
22
23			args = None
24
25			@classmethod
26			def init(cls, args):
27			cls.args = args # pragma: no cover
28
29
30			class ProjectSuggestMap:
31			"""A utility class that can be used to wrap one or more projects and
32			provide a mapping method that converts Document objects to suggestions.
33			Intended to be used with the multiprocessing module."""
34
35			def __init__(self, registry, project_ids, backend_params, limit, threshold):
36			self.registry = registry
37			self.project_ids = project_ids
38			self.backend_params = backend_params
39			self.limit = limit
40			self.threshold = threshold
41
42			def suggest_batch(self, batch):
43			filtered_hit_sets = defaultdict(list)
44			texts, subject_sets = zip(*[(doc.text, doc.subject_set) for doc in batch])
45
46			for project_id in self.project_ids:
47			project = self.registry.get_project(project_id)
48			hit_sets = project.suggest(texts, self.backend_params)
49			for hits in hit_sets:
50			filtered_hit_sets[project_id].append(
51			hits.filter(project.subjects, self.limit, self.threshold)
52			)
53			return (filtered_hit_sets, subject_sets)
54
55
56			def get_pool(n_jobs):
57			"""return a suitable multiprocessing pool class, and the correct jobs
58			argument for its constructor, for the given amount of parallel jobs"""
59
60			ctx = multiprocessing.get_context(MP_START_METHOD)
61
62			if n_jobs < 1:
63			n_jobs = None
64			pool_class = ctx.Pool
65			elif n_jobs == 1:
66			# use the dummy wrapper around threading to avoid subprocess overhead
67			pool_class = multiprocessing.dummy.Pool
68			else:
69			pool_class = ctx.Pool
70
71			return n_jobs, pool_class
72

NatLibFi / Annif

Pull Request — master (#676)

annif.parallel.ProjectSuggestMap.suggest() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like