ocrd.processor.base - Code Metrics - Inspection of "Processor.resolve_resource: support on-demand down..." - OCR-D/core - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#799)

by Konstantin

created 2022-02-14 09:27 UTC

ocrd.processor.base D

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	387
Duplicated Lines	6.2 %

Importance

Changes

Metric	Value
wmc	59
eloc	218
dl	24
loc	387
rs	4.08
c	0
b	0
f	0

10 Methods

Rating	Name	Duplication	Size	Complexity
F	Processor.zip_input_files()	24	111	25
A	Processor.show_version()	0	2	1
A	Processor.process()	0	11	1
F	Processor.__init__()	0	105	19
A	Processor.list_all_resources()	0	5	1
A	Processor.verify()	0	5	1
B	Processor.resolve_resource()	0	38	6
A	Processor.add_metadata()	0	22	1
A	Processor.input_files()	0	25	3
A	Processor.show_help()	0	2	1

How to fix Duplicated Code Complexity

"""
Processor base class and helper functions.
"""

__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]

from os.path import exists
from shutil import copyfileobj
import json
import os
from os import getcwd
from pathlib import Path
import sys
import tarfile
import io

from ocrd_utils import (
    VERSION as OCRD_VERSION,
    MIMETYPE_PAGE,
    getLogger,
    initLogging,
    list_resource_candidates,
    nth_url_segment,
    pushd_popd,
    list_all_resources,
    get_processor_resource_types
)
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
from ocrd.resource_manager import OcrdResourceManager

# XXX imports must remain for backwards-compatibilty
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import

class Processor():
    """
    A processor is a tool that implements the uniform OCR-D command-line interface
    for run-time data processing. That is, it executes a single workflow step,
    or a combination of workflow steps, on the workspace (represented by local METS).
    It reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take 
    a number of optional or mandatory parameters.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        """
        Instantiate, but do not process. Unless ``list_resources`` or
        ``show_resource`` or ``show_help`` or ``show_version`` or
        ``dump_json`` is true, setup for processing (parsing and
        validating parameters, entering the workspace directory).

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
                 Can be ``None`` even for processing (esp. on multiple workspaces), \
                 but then needs to be set before running.
        Keyword Args:
             ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
                 Can be ``None`` for processing, but needs to be set before running.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
                 (or empty for all pages).
             show_resource (string): If not ``None``, then instead of processing, resolve \
                 given resource by name and print its contents to stdout.
             list_resources (boolean): If true, then instead of processing, find all installed \
                 resource files in the search paths and print their path names.
             show_help (boolean): If true, then instead of processing, print a usage description \
                 including the standard CLI and all of this processor's ocrd-tool parameters and \
                 docstrings.
             show_version (boolean): If true, then instead of processing, print information on \
                 this processor's version and OCR-D version. Exit afterwards.
             dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
                 on stdout.
        """
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if list_resources:
            has_dirs, has_files = get_processor_resource_types(None, ocrd_tool)
            for res in list_all_resources(ocrd_tool['executable']):
                if Path(res).is_dir() and not has_dirs:
                    continue
                if not Path(res).is_dir() and not has_files:
                    continue
                print(res)
            return
        if show_resource:
            has_dirs, has_files = get_processor_resource_types(None, ocrd_tool)
            res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource)
            if not res_fname:
                initLogging()
                logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
                logger.error("Failed to resolve %s for processor %s" % (show_resource, ocrd_tool['executable']))
            else:
                fpath = Path(res_fname[0])
                if fpath.is_dir():
                    with pushd_popd(fpath):
                        fileobj = io.BytesIO()
                        with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball:
                            tarball.add('.')
                        fileobj.seek(0)
                        copyfileobj(fileobj, sys.stdout.buffer)
                else:
                    sys.stdout.buffer.write(fpath.read_bytes())
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        print(generate_processor_help(self.ocrd_tool, processor_instance=self))

    def show_version(self):
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))

    def verify(self):
        """
        Verify that the :py:attr:`input_file_grp` fulfills the processor's requirements.
        """
        return True

    def process(self):
        """
        Process the :py:attr:`workspace` 
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id`
        under the given :py:attr:`parameter`.
        
        (This contains the main functionality and needs to be overridden by subclasses.)
        """
        raise Exception("Must be implemented")


    def add_metadata(self, pcgts):
        """
        Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
        the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.
        """
        pcgts.get_Metadata().add_MetadataItem(
                MetadataItemType(type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=self.ocrd_tool['executable'],
                    Labels=[LabelsType(
                        externalModel="ocrd-tool",
                        externalId="parameters",
                        Label=[LabelType(type_=name,
                                         value=self.parameter[name])
                               for name in self.parameter.keys()]),
                            LabelsType(
                        externalModel="ocrd-tool",
                        externalId="version",
                        Label=[LabelType(type_=self.ocrd_tool['executable'],
                                         value=self.version),
                               LabelType(type_='ocrd/core',
                                         value=OCRD_VERSION)])
                    ]))

    def resolve_resource(self, val):
        """
        Resolve a resource name to an absolute file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        Args:
            val (string): resource value to resolve
        """
        executable = self.ocrd_tool['executable']
        log = getLogger('ocrd.%s.resolve_resource' % executable)
        if exists(val):
            log.debug("Resolved to absolute path %s" % val)
            return val
        ret = [cand for cand in list_resource_candidates(executable, val, cwd=self.old_pwd) if exists(cand)]
        if ret:
            log.debug("Resolved %s to absolute path %s" % (val, ret[0]))
            return ret[0]
        elif (val.startswith('http://') or val.startswith('https://')):
            resmgr = OcrdResourceManager()
            reslist = resmgr.find_resources(executable, url=val)
            if reslist:
                _, resdict = reslist[0]
                log.info("Found registered resource for %s: '%s' (%s)." % (executable, val, resdict))
            else:
                resdict = {}
                log.info("Not a registered resource for %s: '%s'." % (executable, val))
            return str(resmgr.download(
                executable,
                val,
                basedir = resmgr.location_to_resource_dir('data'),
                name=resdict.get('name', nth_url_segment(val)),
                path_in_archive=resdict.get('path_in_archive', '.'),
                resource_type=resdict.get('type', 'file')
                ))
        else:
            log.error("Could not find resource '%s' for executable '%s'. Try 'ocrd resmgr download %s %s' to download this resource or use a URL for the parameter value.",
                val, executable, executable, val)
            sys.exit(1)

    def list_all_resources(self):
        """
        List all resources found in the filesystem
        """
        return list_all_resources(self.ocrd_tool['executable'])

    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        For each physical page:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)
        Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>_
        
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        ret = self.zip_input_files(mimetype=None, on_error='abort')
        if not ret:
            return []
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [tuples[0] for tuples in ret]

    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = dict()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if ift[i]:
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    # fileGrp has multiple files for this page ID
                    if mimetype:
                        # filter was active, this must not happen
                        if on_error == 'skip':

                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                                    mimetype, file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype != MIMETYPE_PAGE):
                        pass # keep PAGE match
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype == MIMETYPE_PAGE):
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    else:
                        # filter was inactive but no PAGE is in control, this must not happen
                        if on_error == 'skip':

                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                                    file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                else:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts


1		"""
2		Processor base class and helper functions.
3		"""
4
5		__all__ = [
6		'Processor',
7		'generate_processor_help',
8		'run_cli',
9		'run_processor'
10		]
11
12		from os.path import exists
13		from shutil import copyfileobj
14		import json
15		import os
16		from os import getcwd
17		from pathlib import Path
18		import sys
19		import tarfile
20		import io
21
22		from ocrd_utils import (
23		VERSION as OCRD_VERSION,
24		MIMETYPE_PAGE,
25		getLogger,
26		initLogging,
27		list_resource_candidates,
28		nth_url_segment,
29		pushd_popd,
30		list_all_resources,
31		get_processor_resource_types
32		)
33		from ocrd_validators import ParameterValidator
34		from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
35		from ocrd.resource_manager import OcrdResourceManager
36
37		# XXX imports must remain for backwards-compatibilty
38		from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
39
40		class Processor():
41		"""
42		A processor is a tool that implements the uniform OCR-D command-line interface
43		for run-time data processing. That is, it executes a single workflow step,
44		or a combination of workflow steps, on the workspace (represented by local METS).
45		It reads input files for all or requested physical pages of the input fileGrp(s),
46		and writes output files for them into the output fileGrp(s). It may take
47		a number of optional or mandatory parameters.
48		"""
49
50		def __init__(
51		self,
52		workspace,
53		ocrd_tool=None,
54		parameter=None,
55		# TODO OCR-D/core#274
56		# input_file_grp=None,
57		# output_file_grp=None,
58		input_file_grp="INPUT",
59		output_file_grp="OUTPUT",
60		page_id=None,
61		show_resource=None,
62		list_resources=False,
63		show_help=False,
64		show_version=False,
65		dump_json=False,
66		version=None
67		):
68		"""
69		Instantiate, but do not process. Unless ``list_resources`` or
70		``show_resource`` or ``show_help`` or ``show_version`` or
71		``dump_json`` is true, setup for processing (parsing and
72		validating parameters, entering the workspace directory).
73
74		Args:
75		workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
76		Can be ``None`` even for processing (esp. on multiple workspaces), \
77		but then needs to be set before running.
78		Keyword Args:
79		ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
80		Can be ``None`` for processing, but needs to be set before running.
81		parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
82		Can be ``None`` even for processing, but then needs to be set before running.
83		input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
84		output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
85		page_id (string): comma-separated list of METS physical ``page`` IDs to process \
86		(or empty for all pages).
87		show_resource (string): If not ``None``, then instead of processing, resolve \
88		given resource by name and print its contents to stdout.
89		list_resources (boolean): If true, then instead of processing, find all installed \
90		resource files in the search paths and print their path names.
91		show_help (boolean): If true, then instead of processing, print a usage description \
92		including the standard CLI and all of this processor's ocrd-tool parameters and \
93		docstrings.
94		show_version (boolean): If true, then instead of processing, print information on \
95		this processor's version and OCR-D version. Exit afterwards.
96		dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
97		on stdout.
98		"""
99		if parameter is None:
100		parameter = {}
101		if dump_json:
102		print(json.dumps(ocrd_tool, indent=True))
103		return
104		if list_resources:
105		has_dirs, has_files = get_processor_resource_types(None, ocrd_tool)
106		for res in list_all_resources(ocrd_tool['executable']):
107		if Path(res).is_dir() and not has_dirs:
108		continue
109		if not Path(res).is_dir() and not has_files:
110		continue
111		print(res)
112		return
113		if show_resource:
114		has_dirs, has_files = get_processor_resource_types(None, ocrd_tool)
115		res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource)
116		if not res_fname:
117		initLogging()
118		logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
119		logger.error("Failed to resolve %s for processor %s" % (show_resource, ocrd_tool['executable']))
120		else:
121		fpath = Path(res_fname[0])
122		if fpath.is_dir():
123		with pushd_popd(fpath):
124		fileobj = io.BytesIO()
125		with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball:
126		tarball.add('.')
127		fileobj.seek(0)
128		copyfileobj(fileobj, sys.stdout.buffer)
129		else:
130		sys.stdout.buffer.write(fpath.read_bytes())
131		return
132		self.ocrd_tool = ocrd_tool
133		if show_help:
134		self.show_help()
135		return
136		self.version = version
137		if show_version:
138		self.show_version()
139		return
140		self.workspace = workspace
141		# FIXME HACK would be better to use pushd_popd(self.workspace.directory)
142		# but there is no way to do that in process here since it's an
143		# overridden method. chdir is almost always an anti-pattern.
144		if self.workspace:
145		self.old_pwd = getcwd()
146		os.chdir(self.workspace.directory)
147		self.input_file_grp = input_file_grp
148		self.output_file_grp = output_file_grp
149		self.page_id = None if page_id == [] or page_id is None else page_id
150		parameterValidator = ParameterValidator(ocrd_tool)
151		report = parameterValidator.validate(parameter)
152		if not report.is_valid:
153		raise Exception("Invalid parameters %s" % report.errors)
154		self.parameter = parameter
155
156		def show_help(self):
157		print(generate_processor_help(self.ocrd_tool, processor_instance=self))
158
159		def show_version(self):
160		print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))
161
162		def verify(self):
163		"""
164		Verify that the :py:attr:`input_file_grp` fulfills the processor's requirements.
165		"""
166		return True
167
168		def process(self):
169		"""
170		Process the :py:attr:`workspace`
171		from the given :py:attr:`input_file_grp`
172		to the given :py:attr:`output_file_grp`
173		for the given :py:attr:`page_id`
174		under the given :py:attr:`parameter`.
175
176		(This contains the main functionality and needs to be overridden by subclasses.)
177		"""
178		raise Exception("Must be implemented")
179
180
181		def add_metadata(self, pcgts):
182		"""
183		Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
184		the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.
185		"""
186		pcgts.get_Metadata().add_MetadataItem(
187		MetadataItemType(type_="processingStep",
188		name=self.ocrd_tool['steps'][0],
189		value=self.ocrd_tool['executable'],
190		Labels=[LabelsType(
191		externalModel="ocrd-tool",
192		externalId="parameters",
193		Label=[LabelType(type_=name,
194		value=self.parameter[name])
195		for name in self.parameter.keys()]),
196		LabelsType(
197		externalModel="ocrd-tool",
198		externalId="version",
199		Label=[LabelType(type_=self.ocrd_tool['executable'],
200		value=self.version),
201		LabelType(type_='ocrd/core',
202		value=OCRD_VERSION)])
203		]))
204
205		def resolve_resource(self, val):
206		"""
207		Resolve a resource name to an absolute file path with the algorithm in
208		https://ocr-d.de/en/spec/ocrd_tool#file-parameters
209
210		Args:
211		val (string): resource value to resolve
212		"""
213		executable = self.ocrd_tool['executable']
214		log = getLogger('ocrd.%s.resolve_resource' % executable)
215		if exists(val):
216		log.debug("Resolved to absolute path %s" % val)
217		return val
218		ret = [cand for cand in list_resource_candidates(executable, val, cwd=self.old_pwd) if exists(cand)]
219		if ret:
220		log.debug("Resolved %s to absolute path %s" % (val, ret[0]))
221		return ret[0]
222		elif (val.startswith('http://') or val.startswith('https://')):
223		resmgr = OcrdResourceManager()
224		reslist = resmgr.find_resources(executable, url=val)
225		if reslist:
226		_, resdict = reslist[0]
227		log.info("Found registered resource for %s: '%s' (%s)." % (executable, val, resdict))
228		else:
229		resdict = {}
230		log.info("Not a registered resource for %s: '%s'." % (executable, val))
231		return str(resmgr.download(
232		executable,
233		val,
234		basedir = resmgr.location_to_resource_dir('data'),
235		name=resdict.get('name', nth_url_segment(val)),
236		path_in_archive=resdict.get('path_in_archive', '.'),
237		resource_type=resdict.get('type', 'file')
238		))
239		else:
240		log.error("Could not find resource '%s' for executable '%s'. Try 'ocrd resmgr download %s %s' to download this resource or use a URL for the parameter value.",
241		val, executable, executable, val)
242		sys.exit(1)
243
244		def list_all_resources(self):
245		"""
246		List all resources found in the filesystem
247		"""
248		return list_all_resources(self.ocrd_tool['executable'])
249
250		@property
251		def input_files(self):
252		"""
253		List the input files (for single-valued :py:attr:`input_file_grp`).
254
255		For each physical page:
256
257		- If there is a single PAGE-XML for the page, take it (and forget about all
258		other files for that page)
259		- Else if there is a single image file, take it (and forget about all other
260		files for that page)
261		- Otherwise raise an error (complaining that only PAGE-XML warrants
262		having multiple images for a single page)
263		Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>_
264
265		Returns:
266		A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
267		"""
268		if not self.input_file_grp:
269		raise ValueError("Processor is missing input fileGrp")
270		ret = self.zip_input_files(mimetype=None, on_error='abort')
271		if not ret:
272		return []
273		assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
274		return [tuples[0] for tuples in ret]
275
276		def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
277		"""
278		List tuples of input files (for multi-valued :py:attr:`input_file_grp`).
279
280		Processors that expect/need multiple input file groups,
281		cannot use :py:data:`input_files`. They must align (zip) input files
282		across pages. This includes the case where not all pages
283		are equally present in all file groups. It also requires
284		making a consistent selection if there are multiple files
285		per page.
286
287		Following the OCR-D functional model, this function tries to
288		find a single PAGE file per page, or fall back to a single
289		image file per page. In either case, multiple matches per page
290		are an error (see error handling below).
291		This default behaviour can be changed by using a fixed MIME
292		type filter via :py:attr:`mimetype`. But still, multiple matching
293		files per page are an error.
294
295		Single-page multiple-file errors are handled according to
296		:py:attr:`on_error`:
297
298		- if ``skip``, then the page for the respective fileGrp will be
299		silently skipped (as if there was no match at all)
300		- if ``first``, then the first matching file for the page will be
301		silently selected (as if the first was the only match)
302		- if ``last``, then the last matching file for the page will be
303		silently selected (as if the last was the only match)
304		- if ``abort``, then an exception will be raised.
305		Multiple matches for PAGE-XML will always raise an exception.
306
307		Keyword Args:
308		require_first (boolean): If true, then skip a page entirely
309		whenever it is not available in the first input `fileGrp`.
310		mimetype (string): If not `None`, filter by the specified MIME
311		type (literal or regex prefixed by `//`). Otherwise prefer
312		PAGE or image.
313		Returns:
314		A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
315		"""
316		if not self.input_file_grp:
317		raise ValueError("Processor is missing input fileGrp")
318
319		LOG = getLogger('ocrd.processor.base')
320		ifgs = self.input_file_grp.split(",")
321		# Iterating over all files repeatedly may seem inefficient at first sight,
322		# but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
323		# can actually be much more costly than traversing the ltree.
324		# This might depend on the number of pages vs number of fileGrps.
325
326		pages = dict()
327		for i, ifg in enumerate(ifgs):
328		for file_ in sorted(self.workspace.mets.find_all_files(
329		pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
330		# sort by MIME type so PAGE comes before images
331		key=lambda file_: file_.mimetype):
332		if not file_.pageId:
333		continue
334		ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
335		if ift[i]:
336		LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
337		# fileGrp has multiple files for this page ID
338		if mimetype:
339		# filter was active, this must not happen
340	View Code Duplication	if on_error == 'skip':
		0 ignored issues – show Duplication introduced 2020-11-02 16:44 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
341		ift[i] = None
342		elif on_error == 'first':
343		pass # keep first match
344		elif on_error == 'last':
345		ift[i] = file_
346		elif on_error == 'abort':
347		raise ValueError(
348		"Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
349		mimetype, file_.pageId, ifg))
350		else:
351		raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
352		elif (ift[i].mimetype == MIMETYPE_PAGE and
353		file_.mimetype != MIMETYPE_PAGE):
354		pass # keep PAGE match
355		elif (ift[i].mimetype == MIMETYPE_PAGE and
356		file_.mimetype == MIMETYPE_PAGE):
357		raise ValueError(
358		"Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
359		file_.pageId, ifg))
360		else:
361		# filter was inactive but no PAGE is in control, this must not happen
362	View Code Duplication	if on_error == 'skip':
		0 ignored issues – show Duplication introduced 2020-11-02 16:44 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
363		ift[i] = None
364		elif on_error == 'first':
365		pass # keep first match
366		elif on_error == 'last':
367		ift[i] = file_
368		elif on_error == 'abort':
369		raise ValueError(
370		"No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
371		file_.pageId, ifg))
372		else:
373		raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
374		else:
375		LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
376		ift[i] = file_
377		ifts = list()
378		for page, ifiles in pages.items():
379		for i, ifg in enumerate(ifgs):
380		if not ifiles[i]:
381		# other fallback options?
382		LOG.error('found no page %s in file group %s',
383		page, ifg)
384		if ifiles[0] or not require_first:
385		ifts.append(tuple(ifiles))
386		return ifts
387

OCR-D / core

Pull Request — master (#799)

ocrd.processor.base D

Complexity

Size/Duplication

Importance

10 Methods

How to fix Duplicated Code Complexity

Duplicated Code

Complexity

Duplication Side-by-Side

Filter issues like