Passed
Pull Request — master (#777)
by Konstantin, created 02:26

ocrd.processor.base   (rating C)

Complexity

Total Complexity 56

Size/Duplication

Total Lines 367
Duplicated Lines 6.54 %

Importance

Changes 0

Metric                              Value
wmc   (weighted method count)       56
eloc  (executable lines of code)    201
dl    (duplicated lines)            24
loc   (lines of code)               367
rs                                  5.5199
c                                   0
b                                   0
f                                   0

10 Methods

Rating   Name   Duplication   Size   Complexity  
F Processor.zip_input_files() 24 111 25
A Processor.show_version() 0 2 1
A Processor.process() 0 11 1
F Processor.__init__() 0 105 19
A Processor.list_all_resources() 0 5 1
A Processor.verify() 0 5 1
A Processor.resolve_resource() 0 20 3
A Processor.add_metadata() 0 22 1
A Processor.input_files() 0 25 3
A Processor.show_help() 0 2 1

How to fix

Duplicated Code

Duplicate code is one of the most pungent code smells. A common rule of thumb is to restructure code once it is duplicated in three or more places.

Common duplication problems each have a corresponding refactoring solution. In this module, the flagged duplication is the pair of identical on_error dispatch blocks inside Processor.zip_input_files() (24 duplicated lines), which can be extracted into a shared helper, as sketched below.
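
As an illustration, here is a minimal Extract Method sketch for that duplication. The helper name _handle_multimatch is hypothetical (it is not part of ocrd); the branch semantics and messages follow the duplicated code shown further down.

    def _handle_multimatch(on_error, current, candidate, message):
        """Decide which file to keep when a page has multiple matches in one fileGrp."""
        if on_error == 'skip':
            return None          # drop the page for this fileGrp (as if there was no match)
        if on_error == 'first':
            return current       # keep the match that was found first
        if on_error == 'last':
            return candidate     # keep the match that was found last
        if on_error == 'abort':
            raise ValueError(message)
        raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

Both duplicated branches in zip_input_files() would then collapse into a single assignment, e.g. ift[i] = _handle_multimatch(on_error, ift[i], file_, message), removing the 24 duplicated lines reported above.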

Complexity

 Tip:   Before tackling complexity, make sure that you eliminate any duplication first. This can often reduce the size of classes significantly.

Complex classes like ocrd.processor.base often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
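
For ocrd.processor.base, one candidate grouping (a hedged sketch only, not a change that is part of this inspection) is the resource handling spread across __init__() (the show_resource and list_resources branches), resolve_resource() and list_all_resources(). The class name ResourceManager below is hypothetical; the helpers it delegates to are the ones the module already imports from ocrd_utils.

    from os.path import exists
    from ocrd_utils import list_all_resources, list_resource_candidates

    class ResourceManager:
        """Resolve and enumerate file resources for one processor executable."""

        def __init__(self, executable):
            self.executable = executable

        def list_all(self):
            # same behaviour as Processor.list_all_resources()
            return list_all_resources(self.executable)

        def resolve(self, val, cwd=None):
            # same lookup order as Processor.resolve_resource(), but returns None
            # instead of exiting, so the caller decides how to fail
            if exists(val):
                return val
            candidates = [cand for cand in list_resource_candidates(self.executable, val, cwd=cwd)
                          if exists(cand)]
            return candidates[0] if candidates else None

Processor.resolve_resource() and Processor.list_all_resources() could then become thin wrappers around such a component, lowering the class's weighted method count without changing its CLI behaviour. The reviewed source of ocrd.processor.base follows.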

"""
Processor base class and helper functions.
"""

__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]

from os.path import exists
from shutil import copyfileobj
import json
import os
from os import getcwd
from pathlib import Path
import sys
import tarfile
import io

from ocrd_utils import (
    VERSION as OCRD_VERSION,
    MIMETYPE_PAGE,
    getLogger,
    initLogging,
    list_resource_candidates,
    pushd_popd,
    list_all_resources,
    get_processor_resource_types
)
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType

# XXX imports must remain for backwards-compatibility
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import

class Processor():
    """
    A processor is a tool that implements the uniform OCR-D command-line interface
    for run-time data processing. That is, it executes a single workflow step,
    or a combination of workflow steps, on the workspace (represented by local METS).
    It reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take
    a number of optional or mandatory parameters.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        """
        Instantiate, but do not process. Unless ``list_resources`` or
        ``show_resource`` or ``show_help`` or ``show_version`` or
        ``dump_json`` is true, setup for processing (parsing and
        validating parameters, entering the workspace directory).

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
                 Can be ``None`` even for processing (esp. on multiple workspaces), \
                 but then needs to be set before running.
        Keyword Args:
             ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
                 Can be ``None`` for processing, but needs to be set before running.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
                 (or empty for all pages).
             show_resource (string): If not ``None``, then instead of processing, resolve \
                 given resource by name and print its contents to stdout.
             list_resources (boolean): If true, then instead of processing, find all installed \
                 resource files in the search paths and print their path names.
             show_help (boolean): If true, then instead of processing, print a usage description \
                 including the standard CLI and all of this processor's ocrd-tool parameters and \
                 docstrings.
             show_version (boolean): If true, then instead of processing, print information on \
                 this processor's version and OCR-D version. Exit afterwards.
             dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
                 on stdout.
        """
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if list_resources:
            has_dirs, has_files = get_processor_resource_types(None, ocrd_tool)
            for res in list_all_resources(ocrd_tool['executable']):
                if Path(res).is_dir() and not has_dirs:
                    continue
                if not Path(res).is_dir() and not has_files:
                    continue
                print(res)
            return
        if show_resource:
            has_dirs, has_files = get_processor_resource_types(None, ocrd_tool)
            res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource)
            if not res_fname:
                initLogging()
                logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
                logger.error("Failed to resolve %s for processor %s" % (show_resource, ocrd_tool['executable']))
            else:
                fpath = Path(res_fname[0])
                if fpath.is_dir():
                    with pushd_popd(fpath):
                        fileobj = io.BytesIO()
                        with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball:
                            tarball.add('.')
                        fileobj.seek(0)
                        copyfileobj(fileobj, sys.stdout.buffer)
                else:
                    sys.stdout.buffer.write(fpath.read_bytes())
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        print(generate_processor_help(self.ocrd_tool, processor_instance=self))

    def show_version(self):
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))

    def verify(self):
        """
        Verify that the :py:attr:`input_file_grp` fulfills the processor's requirements.
        """
        return True

    def process(self):
        """
        Process the :py:attr:`workspace`
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id`
        under the given :py:attr:`parameter`.

        (This contains the main functionality and needs to be overridden by subclasses.)
        """
        raise Exception("Must be implemented")


    def add_metadata(self, pcgts):
        """
        Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
        the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.
        """
        pcgts.get_Metadata().add_MetadataItem(
                MetadataItemType(type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=self.ocrd_tool['executable'],
                    Labels=[LabelsType(
                        externalModel="ocrd-tool",
                        externalId="parameters",
                        Label=[LabelType(type_=name,
                                         value=self.parameter[name])
                               for name in self.parameter.keys()]),
                            LabelsType(
                        externalModel="ocrd-tool",
                        externalId="version",
                        Label=[LabelType(type_=self.ocrd_tool['executable'],
                                         value=self.version),
                               LabelType(type_='ocrd/core',
                                         value=OCRD_VERSION)])
                    ]))

    def resolve_resource(self, val):
        """
        Resolve a resource name to an absolute file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        Args:
            val (string): resource value to resolve
        """
        executable = self.ocrd_tool['executable']
        log = getLogger('ocrd.%s.resolve_resource' % executable)
        if exists(val):
            log.debug("Resolved to absolute path %s" % val)
            return val
        ret = [cand for cand in list_resource_candidates(executable, val, cwd=self.old_pwd) if exists(cand)]
        if ret:
            log.debug("Resolved %s to absolute path %s" % (val, ret[0]))
            return ret[0]
        log.error("Could not find resource '%s' for executable '%s'. Try 'ocrd resmgr download %s %s' to download this resource.",
                val, executable, executable, val)
        sys.exit(1)

    def list_all_resources(self):
        """
        List all resources found in the filesystem
        """
        return list_all_resources(self.ocrd_tool['executable'])

    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        For each physical page:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)

        `Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        ret = self.zip_input_files(mimetype=None, on_error='abort')
        if not ret:
            return []
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [tuples[0] for tuples in ret]

    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Multiple matches for PAGE-XML will always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = dict()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if ift[i]:
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    # fileGrp has multiple files for this page ID
                    if mimetype:
                        # filter was active, this must not happen
                        if on_error == 'skip':
                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                                    mimetype, file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype != MIMETYPE_PAGE):
                        pass # keep PAGE match
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype == MIMETYPE_PAGE):
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    else:
                        # filter was inactive but no PAGE is in control, this must not happen
                        if on_error == 'skip':
                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                                    file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                else:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
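
For context on how this base class is consumed (a minimal sketch, not part of the module above): a concrete processor subclasses Processor, passes its ocrd-tool description and version to the constructor, and overrides process(). The DummyProcessor class and its DUMMY_TOOL description below are hypothetical.

    from ocrd import Processor
    from ocrd_modelfactory import page_from_file

    DUMMY_TOOL = {
        'executable': 'ocrd-dummy-example',      # hypothetical executable name
        'steps': ['preprocessing/optimization'],
        'parameters': {}
    }

    class DummyProcessor(Processor):

        def __init__(self, *args, **kwargs):
            kwargs['ocrd_tool'] = DUMMY_TOOL
            kwargs['version'] = '0.0.1'
            super().__init__(*args, **kwargs)

        def process(self):
            # iterate the selected pages of the (single) input fileGrp
            for input_file in self.input_files:
                pcgts = page_from_file(self.workspace.download_file(input_file))
                # record this processing step (tool, version, parameters) in the PAGE metadata
                self.add_metadata(pcgts)
                # ... actual processing and self.workspace.add_file(...) would go here

Processors with more than one input fileGrp would call self.zip_input_files() instead of the input_files property to obtain per-page tuples, as described in the docstrings above.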