ocrd.processor.base - Code Metrics - Inspection of "Resmgr dynamic discovery" - OCR-D/core - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#800)

by Konstantin

created 2022-08-02 17:56 UTC

ocrd.processor.base F

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	390
Duplicated Lines	6.15 %

Importance

Changes

Metric	Value
wmc	60
eloc	214
dl	24
loc	390
rs	3.6
c	0
b	0
f	0

12 Methods

Rating	Name	Duplication	Size	Complexity
F	Processor.zip_input_files()	24	111	25
A	Processor.process()	0	11	1
A	Processor.show_version()	0	2	1
F	Processor.__init__()	0	95	14
A	Processor.verify()	0	5	1
B	Processor.list_all_resources()	0	15	7
A	Processor.resolve_resource()	0	27	4
A	Processor.add_metadata()	0	22	1
A	Processor.input_files()	0	25	3
A	Processor.moduledir()	0	6	1
A	Processor.module()	0	6	1
A	Processor.show_help()	0	2	1

How to fix Duplicated Code Complexity

"""
Processor base class and helper functions.
"""

__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]

from pkg_resources import resource_filename
from os.path import exists
from shutil import copyfileobj
import json
import os
from os import getcwd
from pathlib import Path
import sys
import tarfile
import io

from ocrd_utils import (
    VERSION as OCRD_VERSION,
    MIMETYPE_PAGE,
    MIME_TO_EXT,
    getLogger,
    initLogging,
    list_resource_candidates,
    pushd_popd,
    list_all_resources,
    get_processor_resource_types
)
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType

# XXX imports must remain for backwards-compatibility
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import

class Processor():
    """
    A processor is a tool that implements the uniform OCR-D command-line interface
    for run-time data processing. That is, it executes a single workflow step,
    or a combination of workflow steps, on the workspace (represented by local METS).
    It reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take 
    a number of optional or mandatory parameters.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        """
        Instantiate, but do not process. Unless ``list_resources`` or
        ``show_resource`` or ``show_help`` or ``show_version`` or
        ``dump_json`` is true, setup for processing (parsing and
        validating parameters, entering the workspace directory).

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
                 Can be ``None`` even for processing (esp. on multiple workspaces), \
                 but then needs to be set before running.
        Keyword Args:
             ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
                 Can be ``None`` for processing, but needs to be set before running.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
                 (or empty for all pages).
             show_resource (string): If not ``None``, then instead of processing, resolve \
                 given resource by name and print its contents to stdout.
             list_resources (boolean): If true, then instead of processing, find all installed \
                 resource files in the search paths and print their path names.
             show_help (boolean): If true, then instead of processing, print a usage description \
                 including the standard CLI and all of this processor's ocrd-tool parameters and \
                 docstrings.
             show_version (boolean): If true, then instead of processing, print information on \
                 this processor's version and OCR-D version. Exit afterwards.
             dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
                 on stdout.
        """
        self.ocrd_tool = ocrd_tool
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if list_resources:
            for res in self.list_all_resources():
                print(res)
            return
        if show_resource:
            initLogging()
            res_fname = self.resolve_resource(show_resource)
            fpath = Path(res_fname)
            if fpath.is_dir():
                with pushd_popd(fpath):
                    fileobj = io.BytesIO()
                    with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball:
                        tarball.add('.')
                    fileobj.seek(0)
                    copyfileobj(fileobj, sys.stdout.buffer)
            else:
                sys.stdout.buffer.write(fpath.read_bytes())
            return
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        """
        Print the help text generated from :py:attr:`ocrd_tool` for this instance to stdout.
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self)
        print(help_text)

    def show_version(self):
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))

    def verify(self):
        """
        Verify that the :py:attr:`input_file_grp` fulfills the processor's requirements.
        """
        return True

    def process(self):
        """
        Process the :py:attr:`workspace` 
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id`
        under the given :py:attr:`parameter`.
        
        (This contains the main functionality and needs to be overridden by subclasses.)
        """
        raise Exception("Must be implemented")


    def add_metadata(self, pcgts):
        """
        Record this processing step in the PAGE-XML metadata of ``pcgts``.

        Appends one :py:class:`~ocrd_models.ocrd_page.MetadataItemType` of type
        ``processingStep`` to the ``Metadata`` of the given
        :py:class:`~ocrd_models.ocrd_page.PcGtsType`. The item is named after the
        first entry of the ocrd-tool ``steps``, valued with the ``executable``,
        and carries two label groups: the runtime parameters, and the versions
        of this processor and of ocrd/core.
        """
        add_item = pcgts.get_Metadata().add_MetadataItem
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        # one label per runtime parameter
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=value)
                   for key, value in self.parameter.items()])
        # versions of the processor itself and of the core library
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=self.ocrd_tool['executable'],
                             value=self.version),
                   LabelType(type_='ocrd/core',
                             value=OCRD_VERSION)])
        add_item(MetadataItemType(type_="processingStep",
                                  name=step_name,
                                  value=executable,
                                  Labels=[parameter_labels, version_labels]))

    def resolve_resource(self, val):
        """
        Resolve a resource name to an absolute file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        Args:
            val (string): resource value to resolve
        """
        executable = self.ocrd_tool['executable']
        log = getLogger('ocrd.%s.resolve_resource' % executable)
        if exists(val):
            log.debug("Resolved to absolute path %s" % val)
            return val
        if hasattr(self, 'old_pwd'):
            cwd = self.old_pwd
        else:
            cwd = getcwd()
        ret = [cand for cand in list_resource_candidates(executable, val,
                                                         cwd=cwd, moduled=self.moduledir)
               if exists(cand)]
        if ret:
            log.debug("Resolved %s to absolute path %s" % (val, ret[0]))
            return ret[0]
        log.error("Could not find resource '%s' for executable '%s'. "
                  "Try 'ocrd resmgr download %s %s' to download this resource.",
                  val, executable, executable, val)
        sys.exit(1)

    def list_all_resources(self):
        """
        Yield the paths of all resources found for this processor's executable,
        filtered by the resource MIME types of the processor.

        Unless the wildcard type ``*/*`` is accepted, directories are only kept
        if ``text/directory`` is accepted, and regular files only if their
        suffix matches at least one accepted MIME type (a MIME type without
        a known suffix matches any file).
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        executable = self.ocrd_tool['executable']
        for found in list_all_resources(executable, moduled=self.moduledir):
            candidate = Path(found)
            if '*/*' in mimetypes:
                # wildcard: everything is acceptable
                yield candidate
                continue
            if candidate.is_dir() and 'text/directory' not in mimetypes:
                continue
            if candidate.is_file():
                suffix = candidate.suffix
                # if we do not know all MIME types, then keep the file, otherwise require suffix match
                if not any(suffix == MIME_TO_EXT.get(mime, suffix) for mime in mimetypes):
                    continue
            yield candidate

    @property
    def module(self):
        """
        The top-level module this processor belongs to.
        """
        return self.__module__.split('.')[0]

    @property
    def moduledir(self):
        """
        Filesystem location of the top-level package (see :py:attr:`module`).
        """
        package_name = self.module
        return resource_filename(package_name, '')

    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        For each physical page:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)
        Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>_
        
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        ret = self.zip_input_files(mimetype=None, on_error='abort')
        if not ret:
            return []
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [tuples[0] for tuples in ret]

    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = dict()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if ift[i]:
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    # fileGrp has multiple files for this page ID
                    if mimetype:
                        # filter was active, this must not happen
                        if on_error == 'skip':

                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                                    mimetype, file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype != MIMETYPE_PAGE):
                        pass # keep PAGE match
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype == MIMETYPE_PAGE):
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    else:
                        # filter was inactive but no PAGE is in control, this must not happen
                        if on_error == 'skip':

                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                                    file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                else:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts


1		"""
2		Processor base class and helper functions.
3		"""
4
5		__all__ = [
6		'Processor',
7		'generate_processor_help',
8		'run_cli',
9		'run_processor'
10		]
11
12		from pkg_resources import resource_filename
13		from os.path import exists
14		from shutil import copyfileobj
15		import json
16		import os
17		from os import getcwd
18		from pathlib import Path
19		import sys
20		import tarfile
21		import io
22
23		from ocrd_utils import (
24		VERSION as OCRD_VERSION,
25		MIMETYPE_PAGE,
26		MIME_TO_EXT,
27		getLogger,
28		initLogging,
29		list_resource_candidates,
30		pushd_popd,
31		list_all_resources,
32		get_processor_resource_types
33		)
34		from ocrd_validators import ParameterValidator
35		from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
36
37		# XXX imports must remain for backwards-compatibilty
38		from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
39
40		class Processor():
41		"""
42		A processor is a tool that implements the uniform OCR-D command-line interface
43		for run-time data processing. That is, it executes a single workflow step,
44		or a combination of workflow steps, on the workspace (represented by local METS).
45		It reads input files for all or requested physical pages of the input fileGrp(s),
46		and writes output files for them into the output fileGrp(s). It may take
47		a number of optional or mandatory parameters.
48		"""
49
50		def __init__(
51		self,
52		workspace,
53		ocrd_tool=None,
54		parameter=None,
55		# TODO OCR-D/core#274
56		# input_file_grp=None,
57		# output_file_grp=None,
58		input_file_grp="INPUT",
59		output_file_grp="OUTPUT",
60		page_id=None,
61		show_resource=None,
62		list_resources=False,
63		show_help=False,
64		show_version=False,
65		dump_json=False,
66		version=None
67		):
68		"""
69		Instantiate, but do not process. Unless ``list_resources`` or
70		``show_resource`` or ``show_help`` or ``show_version`` or
71		``dump_json`` is true, setup for processing (parsing and
72		validating parameters, entering the workspace directory).
73
74		Args:
75		workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
76		Can be ``None`` even for processing (esp. on multiple workspaces), \
77		but then needs to be set before running.
78		Keyword Args:
79		ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
80		Can be ``None`` for processing, but needs to be set before running.
81		parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
82		Can be ``None`` even for processing, but then needs to be set before running.
83		input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
84		output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
85		page_id (string): comma-separated list of METS physical ``page`` IDs to process \
86		(or empty for all pages).
87		show_resource (string): If not ``None``, then instead of processing, resolve \
88		given resource by name and print its contents to stdout.
89		list_resources (boolean): If true, then instead of processing, find all installed \
90		resource files in the search paths and print their path names.
91		show_help (boolean): If true, then instead of processing, print a usage description \
92		including the standard CLI and all of this processor's ocrd-tool parameters and \
93		docstrings.
94		show_version (boolean): If true, then instead of processing, print information on \
95		this processor's version and OCR-D version. Exit afterwards.
96		dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
97		on stdout.
98		"""
99		self.ocrd_tool = ocrd_tool
100		if parameter is None:
101		parameter = {}
102		if dump_json:
103		print(json.dumps(ocrd_tool, indent=True))
104		return
105		if list_resources:
106		for res in self.list_all_resources():
107		print(res)
108		return
109		if show_resource:
110		initLogging()
111		res_fname = self.resolve_resource(show_resource)
112		fpath = Path(res_fname)
113		if fpath.is_dir():
114		with pushd_popd(fpath):
115		fileobj = io.BytesIO()
116		with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball:
117		tarball.add('.')
118		fileobj.seek(0)
119		copyfileobj(fileobj, sys.stdout.buffer)
120		else:
121		sys.stdout.buffer.write(fpath.read_bytes())
122		return
123		if show_help:
124		self.show_help()
125		return
126		self.version = version
127		if show_version:
128		self.show_version()
129		return
130		self.workspace = workspace
131		# FIXME HACK would be better to use pushd_popd(self.workspace.directory)
132		# but there is no way to do that in process here since it's an
133		# overridden method. chdir is almost always an anti-pattern.
134		if self.workspace:
135		self.old_pwd = getcwd()
136		os.chdir(self.workspace.directory)
137		self.input_file_grp = input_file_grp
138		self.output_file_grp = output_file_grp
139		self.page_id = None if page_id == [] or page_id is None else page_id
140		parameterValidator = ParameterValidator(ocrd_tool)
141		report = parameterValidator.validate(parameter)
142		if not report.is_valid:
143		raise Exception("Invalid parameters %s" % report.errors)
144		self.parameter = parameter
145
146		def show_help(self):
147		print(generate_processor_help(self.ocrd_tool, processor_instance=self))
148
149		def show_version(self):
150		print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))
151
152		def verify(self):
153		"""
154		Verify that the :py:attr:`input_file_grp` fulfills the processor's requirements.
155		"""
156		return True
157
158		def process(self):
159		"""
160		Process the :py:attr:`workspace`
161		from the given :py:attr:`input_file_grp`
162		to the given :py:attr:`output_file_grp`
163		for the given :py:attr:`page_id`
164		under the given :py:attr:`parameter`.
165
166		(This contains the main functionality and needs to be overridden by subclasses.)
167		"""
168		raise Exception("Must be implemented")
169
170
171		def add_metadata(self, pcgts):
172		"""
173		Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
174		the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.
175		"""
176		pcgts.get_Metadata().add_MetadataItem(
177		MetadataItemType(type_="processingStep",
178		name=self.ocrd_tool['steps'][0],
179		value=self.ocrd_tool['executable'],
180		Labels=[LabelsType(
181		externalModel="ocrd-tool",
182		externalId="parameters",
183		Label=[LabelType(type_=name,
184		value=self.parameter[name])
185		for name in self.parameter.keys()]),
186		LabelsType(
187		externalModel="ocrd-tool",
188		externalId="version",
189		Label=[LabelType(type_=self.ocrd_tool['executable'],
190		value=self.version),
191		LabelType(type_='ocrd/core',
192		value=OCRD_VERSION)])
193		]))
194
195		def resolve_resource(self, val):
196		"""
197		Resolve a resource name to an absolute file path with the algorithm in
198		https://ocr-d.de/en/spec/ocrd_tool#file-parameters
199
200		Args:
201		val (string): resource value to resolve
202		"""
203		executable = self.ocrd_tool['executable']
204		log = getLogger('ocrd.%s.resolve_resource' % executable)
205		if exists(val):
206		log.debug("Resolved to absolute path %s" % val)
207		return val
208		if hasattr(self, 'old_pwd'):
209		cwd = self.old_pwd
210		else:
211		cwd = getcwd()
212		ret = [cand for cand in list_resource_candidates(executable, val,
213		cwd=cwd, moduled=self.moduledir)
214		if exists(cand)]
215		if ret:
216		log.debug("Resolved %s to absolute path %s" % (val, ret[0]))
217		return ret[0]
218		log.error("Could not find resource '%s' for executable '%s'. "
219		"Try 'ocrd resmgr download %s %s' to download this resource.",
220		val, executable, executable, val)
221		sys.exit(1)
222
223		def list_all_resources(self):
224		"""
225		List all resources found in the filesystem and matching content-type by filename suffix
226		"""
227		mimetypes = get_processor_resource_types(None, self.ocrd_tool)
228		for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
229		res = Path(res)
230		if not '*/*' in mimetypes:
231		if res.is_dir() and not 'text/directory' in mimetypes:
232		continue
233		# if we do not know all MIME types, then keep the file, otherwise require suffix match
234		if res.is_file() and not any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
235		for mime in mimetypes):
236		continue
237		yield res
238
239		@property
240		def module(self):
241		"""
242		The top-level module this processor belongs to.
243		"""
244		return self.__module__.split('.')[0]
245
246		@property
247		def moduledir(self):
248		"""
249		The filesystem path of the module directory.
250		"""
251		return resource_filename(self.module, '')
252
253		@property
254		def input_files(self):
255		"""
256		List the input files (for single-valued :py:attr:`input_file_grp`).
257
258		For each physical page:
259
260		- If there is a single PAGE-XML for the page, take it (and forget about all
261		other files for that page)
262		- Else if there is a single image file, take it (and forget about all other
263		files for that page)
264		- Otherwise raise an error (complaining that only PAGE-XML warrants
265		having multiple images for a single page)
266		Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>_
267
268		Returns:
269		A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
270		"""
271		if not self.input_file_grp:
272		raise ValueError("Processor is missing input fileGrp")
273		ret = self.zip_input_files(mimetype=None, on_error='abort')
274		if not ret:
275		return []
276		assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
277		return [tuples[0] for tuples in ret]
278
279		def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
280		"""
281		List tuples of input files (for multi-valued :py:attr:`input_file_grp`).
282
283		Processors that expect/need multiple input file groups,
284		cannot use :py:data:`input_files`. They must align (zip) input files
285		across pages. This includes the case where not all pages
286		are equally present in all file groups. It also requires
287		making a consistent selection if there are multiple files
288		per page.
289
290		Following the OCR-D functional model, this function tries to
291		find a single PAGE file per page, or fall back to a single
292		image file per page. In either case, multiple matches per page
293		are an error (see error handling below).
294		This default behaviour can be changed by using a fixed MIME
295		type filter via :py:attr:`mimetype`. But still, multiple matching
296		files per page are an error.
297
298		Single-page multiple-file errors are handled according to
299		:py:attr:`on_error`:
300
301		- if ``skip``, then the page for the respective fileGrp will be
302		silently skipped (as if there was no match at all)
303		- if ``first``, then the first matching file for the page will be
304		silently selected (as if the first was the only match)
305		- if ``last``, then the last matching file for the page will be
306		silently selected (as if the last was the only match)
307		- if ``abort``, then an exception will be raised.
308		Multiple matches for PAGE-XML will always raise an exception.
309
310		Keyword Args:
311		require_first (boolean): If true, then skip a page entirely
312		whenever it is not available in the first input `fileGrp`.
313		mimetype (string): If not `None`, filter by the specified MIME
314		type (literal or regex prefixed by `//`). Otherwise prefer
315		PAGE or image.
316		Returns:
317		A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
318		"""
319		if not self.input_file_grp:
320		raise ValueError("Processor is missing input fileGrp")
321
322		LOG = getLogger('ocrd.processor.base')
323		ifgs = self.input_file_grp.split(",")
324		# Iterating over all files repeatedly may seem inefficient at first sight,
325		# but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
326		# can actually be much more costly than traversing the ltree.
327		# This might depend on the number of pages vs number of fileGrps.
328
329		pages = dict()
330		for i, ifg in enumerate(ifgs):
331		for file_ in sorted(self.workspace.mets.find_all_files(
332		pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
333		# sort by MIME type so PAGE comes before images
334		key=lambda file_: file_.mimetype):
335		if not file_.pageId:
336		continue
337		ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
338		if ift[i]:
339		LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
340		# fileGrp has multiple files for this page ID
341		if mimetype:
342		# filter was active, this must not happen
343	View Code Duplication	if on_error == 'skip':
		0 ignored issues – show Duplication introduced 2020-11-02 16:44 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
344		ift[i] = None
345		elif on_error == 'first':
346		pass # keep first match
347		elif on_error == 'last':
348		ift[i] = file_
349		elif on_error == 'abort':
350		raise ValueError(
351		"Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
352		mimetype, file_.pageId, ifg))
353		else:
354		raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
355		elif (ift[i].mimetype == MIMETYPE_PAGE and
356		file_.mimetype != MIMETYPE_PAGE):
357		pass # keep PAGE match
358		elif (ift[i].mimetype == MIMETYPE_PAGE and
359		file_.mimetype == MIMETYPE_PAGE):
360		raise ValueError(
361		"Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
362		file_.pageId, ifg))
363		else:
364		# filter was inactive but no PAGE is in control, this must not happen
365	View Code Duplication	if on_error == 'skip':
		0 ignored issues – show Duplication introduced 2020-11-02 16:44 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
366		ift[i] = None
367		elif on_error == 'first':
368		pass # keep first match
369		elif on_error == 'last':
370		ift[i] = file_
371		elif on_error == 'abort':
372		raise ValueError(
373		"No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
374		file_.pageId, ifg))
375		else:
376		raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
377		else:
378		LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
379		ift[i] = file_
380		ifts = list()
381		for page, ifiles in pages.items():
382		for i, ifg in enumerate(ifgs):
383		if not ifiles[i]:
384		# other fallback options?
385		LOG.error('found no page %s in file group %s',
386		page, ifg)
387		if ifiles[0] or not require_first:
388		ifts.append(tuple(ifiles))
389		return ifts
390

OCR-D / core

Pull Request — master (#800)

ocrd.processor.base F

Complexity

Size/Duplication

Importance

12 Methods

How to fix Duplicated Code Complexity

Duplicated Code

Complexity

Duplication Side-by-Side

Filter issues like