ocrd.processor.base - Code Metrics - Inspection of ":package: v2.67.0" - OCR-D/core - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 648be5...719bbc )

by Konstantin

created 2024-07-16 17:41 UTC

ocrd.processor.base F

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	444
Duplicated Lines	5.41 %

Importance

Changes

Metric	Value
wmc	72
eloc	255
dl	24
loc	444
rs	2.64
c	0
b	0
f	0

14 Methods

Rating	Name	Duplication	Size	Complexity
A	ResourceNotFoundError.__init__()	0	7	1
F	Processor.zip_input_files()	24	120	28
A	Processor.process()	0	11	1
A	Processor.show_version()	0	2	1
A	Processor.show_resource()	0	12	4
F	Processor.__init__()	0	106	15
A	Processor.verify()	0	5	1
B	Processor.list_all_resources()	0	15	7
A	Processor.resolve_resource()	0	25	4
A	Processor.add_metadata()	0	22	1
A	Processor.input_files()	0	25	3
A	Processor.moduledir()	0	6	1
A	Processor.module()	0	15	4
A	Processor.show_help()	0	2	1

How to fix Duplicated Code Complexity

"""
Processor base class and helper functions.
"""

__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]

from os.path import exists
from shutil import copyfileobj
import json
import os
from os import getcwd
from pathlib import Path
import sys
import tarfile
import io
from ocrd.workspace import Workspace

from ocrd_utils import (
    VERSION as OCRD_VERSION,
    MIMETYPE_PAGE,
    MIME_TO_EXT,
    getLogger,
    initLogging,
    list_resource_candidates,
    pushd_popd,
    list_all_resources,
    get_processor_resource_types,
    resource_filename,
)
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType

# XXX imports must remain for backwards-compatibility
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import

class ResourceNotFoundError(FileNotFoundError):
    """
    Raised when a requested processor resource cannot be resolved.

    The instance keeps the requested resource ``name``, the ``executable``
    it was requested for, and the complete human-readable ``message``
    (which is also what the underlying :py:class:`FileNotFoundError`
    is initialized with).
    """
    def __init__(self, name, executable):
        # The hint repeats both values in the order expected by `ocrd resmgr download`.
        template = ("Could not find resource '%s' for executable '%s'. "
                    "Try 'ocrd resmgr download %s %s' to download this resource.")
        self.name, self.executable = name, executable
        self.message = template % (name, executable, executable, name)
        super().__init__(self.message)

class Processor():
    """
    A processor is a tool that implements the uniform OCR-D command-line interface
    for run-time data processing. That is, it executes a single workflow step,
    or a combination of workflow steps, on the workspace (represented by local METS).
    It reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take 
    a number of optional or mandatory parameters.
    """

    def __init__(
            self,
            workspace : Workspace,
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            resolve_resource=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            subcommand=None,
            show_version=False,
            dump_json=False,
            dump_module_dir=False,
            version=None
    ):
        """
        Instantiate, but do not process. Unless ``list_resources`` or
        ``show_resource`` or ``show_help`` or ``show_version`` or
        ``dump_json`` or ``dump_module_dir`` is true, setup for processing
        (parsing and validating parameters, entering the workspace directory).

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
                 Can be ``None`` even for processing (esp. on multiple workspaces), \
                 but then needs to be set before running.
        Keyword Args:
             ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
                 Can be ``None`` for processing, but needs to be set before running.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
                 (or empty for all pages).
             resolve_resource (string): If not ``None``, then instead of processing, resolve \
                 given resource by name and print its full path to stdout.
             show_resource (string): If not ``None``, then instead of processing, resolve \
                 given resource by name and print its contents to stdout.
             list_resources (boolean): If true, then instead of processing, find all installed \
                 resource files in the search paths and print their path names.
             show_help (boolean): If true, then instead of processing, print a usage description \
                 including the standard CLI and all of this processor's ocrd-tool parameters and \
                 docstrings.
             subcommand (string): 'worker' or 'server', only used here for the right --help output
             show_version (boolean): If true, then instead of processing, print information on \
                 this processor's version and OCR-D version. Exit afterwards.
             dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
                 on stdout.
             dump_module_dir (boolean): If true, then instead of processing, print :py:attr:`moduledir` \
                 on stdout.
        """
        self.ocrd_tool = ocrd_tool
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if dump_module_dir:
            print(self.moduledir)
            return
        if list_resources:
            for res in self.list_all_resources():
                print(res)
            return
        if resolve_resource:
            try:
                res = self.resolve_resource(resolve_resource)
                print(res)
            except ResourceNotFoundError as e:
                log = getLogger('ocrd.processor.base')
                log.critical(e.message)
                sys.exit(1)
            return
        if show_resource:
            try:
                self.show_resource(show_resource)
            except ResourceNotFoundError as e:
                log = getLogger('ocrd.processor.base')
                log.critical(e.message)
                sys.exit(1)
            return
        if show_help:
            self.show_help(subcommand=subcommand)
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        if parameter is None:
            parameter = {}
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self, subcommand=None):
        """
        Print the usage description for this processor to stdout.

        Args:
            subcommand (string): passed through unchanged to
                ``generate_processor_help`` together with :py:attr:`ocrd_tool`
                and this instance.
        """
        help_text = generate_processor_help(
            self.ocrd_tool, processor_instance=self, subcommand=subcommand)
        print(help_text)

    def show_version(self):
        """
        Print this processor's version and the OCR-D core version to stdout.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)

    def verify(self):
        """
        Verify that the :py:attr:`input_file_grp` fulfills the processor's requirements.
        """
        return True

    def process(self) -> None:
        """
        Process the :py:attr:`workspace` 
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id`
        under the given :py:attr:`parameter`.
        
        (This contains the main functionality and needs to be overridden by subclasses.)
        """
        raise NotImplementedError()


    def add_metadata(self, pcgts):
        """
        Record this processing step in the metadata of ``pcgts``.

        Appends one PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType`
        of type ``processingStep`` to the ``Metadata`` of the given
        :py:class:`~ocrd_models.ocrd_page.PcGtsType`. The item is named after the
        first entry of the ocrd-tool ``steps``, carries the ``executable`` as value,
        and holds two label groups: the runtime parameters, and the versions of
        this processor and of OCR-D core.

        Args:
            pcgts: the PAGE document object to annotate (modified in place).
        """
        metadata = pcgts.get_Metadata()
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        # one label per runtime parameter
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=setting)
                   for key, setting in self.parameter.items()])
        # processor version followed by core version
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=executable, value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        metadata.add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             name=step_name,
                             value=executable,
                             Labels=[parameter_labels, version_labels]))

    def resolve_resource(self, val):
        """
        Find the file path for a resource name, following the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        If ``val`` already exists as a path, it is returned unchanged.
        Otherwise the candidate locations for this processor's executable are
        searched (relative to the directory that was current before entering
        the workspace, if any), and the first existing candidate is returned.

        Args:
            val (string): resource value to resolve

        Returns:
            The path of the resource.

        Raises:
            ResourceNotFoundError: if no existing candidate is found.
        """
        initLogging()
        executable = self.ocrd_tool['executable']
        log = getLogger('ocrd.processor.base')
        if exists(val):
            log.debug("Resolved to absolute path %s" % val)
            return val
        # __init__ stores the original working directory before chdir'ing into the workspace
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=cwd, moduled=self.moduledir)
        found = list(filter(exists, candidates))
        if not found:
            raise ResourceNotFoundError(val, executable)
        log.debug("Resolved %s to absolute path %s" % (val, found[0]))
        return found[0]

    def show_resource(self, val):
        """
        Write the contents of a resource to stdout (as raw bytes).

        The resource is located via :py:meth:`resolve_resource`. A regular file
        is written as is; a directory is packed into a gzip-compressed tarball
        (built in memory, with paths relative to the directory) and that
        archive is written instead.

        Args:
            val (string): resource value to resolve and show
        """
        fpath = Path(self.resolve_resource(val))
        if not fpath.is_dir():
            sys.stdout.buffer.write(fpath.read_bytes())
            return
        # change into the directory so that archive members are stored relative to it
        with pushd_popd(fpath):
            archive = io.BytesIO()
            with tarfile.open(fileobj=archive, mode='w:gz') as tarball:
                tarball.add('.')
            archive.seek(0)
            copyfileobj(archive, sys.stdout.buffer)

    def list_all_resources(self):
        """
        Iterate over the resources found in the filesystem for this processor,
        filtered by content-type (judged from the filename suffix).

        Unless the processor accepts ``*/*``, directories are only kept if
        ``text/directory`` is accepted, and regular files are only kept if their
        suffix matches one of the accepted MIME types.

        Yields:
            :py:class:`pathlib.Path` objects of the matching resources.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        accept_anything = '*/*' in mimetypes
        for candidate in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            res = Path(candidate)
            if not accept_anything:
                if res.is_dir() and 'text/directory' not in mimetypes:
                    continue
                if res.is_file():
                    # a MIME type without known extension falls back to the file's
                    # own suffix, i.e. counts as a match (keeps the file)
                    suffix_ok = any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
                                    for mime in mimetypes)
                    if not suffix_ok:
                        continue
            yield res

    @property
    def module(self):
        """
        The top-level module this processor belongs to.
        """
        # find shortest prefix path that is not just a namespace package
        fqname = ''
        for name in self.__module__.split('.'):
            if fqname:
                fqname += '.'
            fqname += name
            if getattr(sys.modules[fqname], '__file__', None):
                return fqname
        # fall-back
        return self.__module__

    @property
    def moduledir(self):
        """
        Filesystem path of the directory of this processor's :py:attr:`module`,
        as reported by ``resource_filename``.
        """
        module_name = self.module
        return resource_filename(module_name, '.')

    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        For each physical page:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)
        Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>_
        
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        ret = self.zip_input_files(mimetype=None, on_error='abort')
        if not ret:
            return []
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [tuples[0] for tuples in ret]

    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = dict()
        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype)
            # Warn if no files found but pageId was specified because that
            # might be because of invalid page_id (range)
            if self.page_id and not files_:
                msg = (f"Could not find any files for --page-id {self.page_id} - "
                       f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.")
                if on_error == 'abort':
                    raise ValueError(msg)
                LOG.warning(msg)
            for file_ in files_:
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if ift[i]:
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    # fileGrp has multiple files for this page ID
                    if mimetype:
                        # filter was active, this must not happen
                        if on_error == 'skip':

                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                                    mimetype, file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype != MIMETYPE_PAGE):
                        pass # keep PAGE match
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype == MIMETYPE_PAGE):
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    else:
                        # filter was inactive but no PAGE is in control, this must not happen
                        if on_error == 'skip':

                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                                    file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                else:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts


1		"""
2		Processor base class and helper functions.
3		"""
4
5		__all__ = [
6		'Processor',
7		'generate_processor_help',
8		'run_cli',
9		'run_processor'
10		]
11
12		from os.path import exists
13		from shutil import copyfileobj
14		import json
15		import os
16		from os import getcwd
17		from pathlib import Path
18		import sys
19		import tarfile
20		import io
21		from ocrd.workspace import Workspace
22
23		from ocrd_utils import (
24		VERSION as OCRD_VERSION,
25		MIMETYPE_PAGE,
26		MIME_TO_EXT,
27		getLogger,
28		initLogging,
29		list_resource_candidates,
30		pushd_popd,
31		list_all_resources,
32		get_processor_resource_types,
33		resource_filename,
34		)
35		from ocrd_validators import ParameterValidator
36		from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
37
38		# XXX imports must remain for backwards-compatibility
39		from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
40
41		class ResourceNotFoundError(FileNotFoundError):
42		"""
43		An exception signifying the requested processor resource
44		cannot be resolved.
45		"""
46		def __init__(self, name, executable):
47		self.name = name
48		self.executable = executable
49		self.message = "Could not find resource '%s' for executable '%s'. " \
50		"Try 'ocrd resmgr download %s %s' to download this resource." \
51		% (name, executable, executable, name)
52		super().__init__(self.message)
53
54		class Processor():
55		"""
56		A processor is a tool that implements the uniform OCR-D command-line interface
57		for run-time data processing. That is, it executes a single workflow step,
58		or a combination of workflow steps, on the workspace (represented by local METS).
59		It reads input files for all or requested physical pages of the input fileGrp(s),
60		and writes output files for them into the output fileGrp(s). It may take
61		a number of optional or mandatory parameters.
62		"""
63
64		def __init__(
65		self,
66		workspace : Workspace,
67		ocrd_tool=None,
68		parameter=None,
69		input_file_grp=None,
70		output_file_grp=None,
71		page_id=None,
72		resolve_resource=None,
73		show_resource=None,
74		list_resources=False,
75		show_help=False,
76		subcommand=None,
77		show_version=False,
78		dump_json=False,
79		dump_module_dir=False,
80		version=None
81		):
82		"""
83		Instantiate, but do not process. Unless ``list_resources`` or
84		``show_resource`` or ``show_help`` or ``show_version`` or
85		``dump_json`` or ``dump_module_dir`` is true, setup for processing
86		(parsing and validating parameters, entering the workspace directory).
87
88		Args:
89		workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
90		Can be ``None`` even for processing (esp. on multiple workspaces), \
91		but then needs to be set before running.
92		Keyword Args:
93		ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
94		Can be ``None`` for processing, but needs to be set before running.
95		parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
96		Can be ``None`` even for processing, but then needs to be set before running.
97		input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
98		output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
99		page_id (string): comma-separated list of METS physical ``page`` IDs to process \
100		(or empty for all pages).
101		resolve_resource (string): If not ``None``, then instead of processing, resolve \
102		given resource by name and print its full path to stdout.
103		show_resource (string): If not ``None``, then instead of processing, resolve \
104		given resource by name and print its contents to stdout.
105		list_resources (boolean): If true, then instead of processing, find all installed \
106		resource files in the search paths and print their path names.
107		show_help (boolean): If true, then instead of processing, print a usage description \
108		including the standard CLI and all of this processor's ocrd-tool parameters and \
109		docstrings.
110		subcommand (string): 'worker' or 'server', only used here for the right --help output
111		show_version (boolean): If true, then instead of processing, print information on \
112		this processor's version and OCR-D version. Exit afterwards.
113		dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
114		on stdout.
115		dump_module_dir (boolean): If true, then instead of processing, print :py:attr:`moduledir` \
116		on stdout.
117		"""
118		self.ocrd_tool = ocrd_tool
119		if dump_json:
120		print(json.dumps(ocrd_tool, indent=True))
121		return
122		if dump_module_dir:
123		print(self.moduledir)
124		return
125		if list_resources:
126		for res in self.list_all_resources():
127		print(res)
128		return
129		if resolve_resource:
130		try:
131		res = self.resolve_resource(resolve_resource)
132		print(res)
133		except ResourceNotFoundError as e:
134		log = getLogger('ocrd.processor.base')
135		log.critical(e.message)
136		sys.exit(1)
137		return
138		if show_resource:
139		try:
140		self.show_resource(show_resource)
141		except ResourceNotFoundError as e:
142		log = getLogger('ocrd.processor.base')
143		log.critical(e.message)
144		sys.exit(1)
145		return
146		if show_help:
147		self.show_help(subcommand=subcommand)
148		return
149		self.version = version
150		if show_version:
151		self.show_version()
152		return
153		self.workspace = workspace
154		# FIXME HACK would be better to use pushd_popd(self.workspace.directory)
155		# but there is no way to do that in process here since it's an
156		# overridden method. chdir is almost always an anti-pattern.
157		if self.workspace:
158		self.old_pwd = getcwd()
159		os.chdir(self.workspace.directory)
160		self.input_file_grp = input_file_grp
161		self.output_file_grp = output_file_grp
162		self.page_id = None if page_id == [] or page_id is None else page_id
163		if parameter is None:
164		parameter = {}
165		parameterValidator = ParameterValidator(ocrd_tool)
166		report = parameterValidator.validate(parameter)
167		if not report.is_valid:
168		raise Exception("Invalid parameters %s" % report.errors)
169		self.parameter = parameter
170
171		def show_help(self, subcommand=None):
172		print(generate_processor_help(self.ocrd_tool, processor_instance=self, subcommand=subcommand))
173
174		def show_version(self):
175		print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))
176
177		def verify(self):
178		"""
179		Verify that the :py:attr:`input_file_grp` fulfills the processor's requirements.
180		"""
181		return True
182
183		def process(self) -> None:
184		"""
185		Process the :py:attr:`workspace`
186		from the given :py:attr:`input_file_grp`
187		to the given :py:attr:`output_file_grp`
188		for the given :py:attr:`page_id`
189		under the given :py:attr:`parameter`.
190
191		(This contains the main functionality and needs to be overridden by subclasses.)
192		"""
193		raise NotImplementedError()
194
195
196		def add_metadata(self, pcgts):
197		"""
198		Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
199		the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.
200		"""
201		pcgts.get_Metadata().add_MetadataItem(
202		MetadataItemType(type_="processingStep",
203		name=self.ocrd_tool['steps'][0],
204		value=self.ocrd_tool['executable'],
205		Labels=[LabelsType(
206		externalModel="ocrd-tool",
207		externalId="parameters",
208		Label=[LabelType(type_=name,
209		value=self.parameter[name])
210		for name in self.parameter.keys()]),
211		LabelsType(
212		externalModel="ocrd-tool",
213		externalId="version",
214		Label=[LabelType(type_=self.ocrd_tool['executable'],
215		value=self.version),
216		LabelType(type_='ocrd/core',
217		value=OCRD_VERSION)])
218		]))
219
220		def resolve_resource(self, val):
221		"""
222		Resolve a resource name to an absolute file path with the algorithm in
223		https://ocr-d.de/en/spec/ocrd_tool#file-parameters
224
225		Args:
226		val (string): resource value to resolve
227		"""
228		initLogging()
229		executable = self.ocrd_tool['executable']
230		log = getLogger('ocrd.processor.base')
231		if exists(val):
232		log.debug("Resolved to absolute path %s" % val)
233		return val
234		if hasattr(self, 'old_pwd'):
235		cwd = self.old_pwd
236		else:
237		cwd = getcwd()
238		ret = [cand for cand in list_resource_candidates(executable, val,
239		cwd=cwd, moduled=self.moduledir)
240		if exists(cand)]
241		if ret:
242		log.debug("Resolved %s to absolute path %s" % (val, ret[0]))
243		return ret[0]
244		raise ResourceNotFoundError(val, executable)
245
246		def show_resource(self, val):
247		res_fname = self.resolve_resource(val)
248		fpath = Path(res_fname)
249		if fpath.is_dir():
250		with pushd_popd(fpath):
251		fileobj = io.BytesIO()
252		with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball:
253		tarball.add('.')
254		fileobj.seek(0)
255		copyfileobj(fileobj, sys.stdout.buffer)
256		else:
257		sys.stdout.buffer.write(fpath.read_bytes())
258
259		def list_all_resources(self):
260		"""
261		List all resources found in the filesystem and matching content-type by filename suffix
262		"""
263		mimetypes = get_processor_resource_types(None, self.ocrd_tool)
264		for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
265		res = Path(res)
266		if not '*/*' in mimetypes:
267		if res.is_dir() and not 'text/directory' in mimetypes:
268		continue
269		# if we do not know all MIME types, then keep the file, otherwise require suffix match
270		if res.is_file() and not any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
271		for mime in mimetypes):
272		continue
273		yield res
274
275		@property
276		def module(self):
277		"""
278		The top-level module this processor belongs to.
279		"""
280		# find shortest prefix path that is not just a namespace package
281		fqname = ''
282		for name in self.__module__.split('.'):
283		if fqname:
284		fqname += '.'
285		fqname += name
286		if getattr(sys.modules[fqname], '__file__', None):
287		return fqname
288		# fall-back
289		return self.__module__
290
291		@property
292		def moduledir(self):
293		"""
294		The filesystem path of the module directory.
295		"""
296		return resource_filename(self.module, '.')
297
298		@property
299		def input_files(self):
300		"""
301		List the input files (for single-valued :py:attr:`input_file_grp`).
302
303		For each physical page:
304
305		- If there is a single PAGE-XML for the page, take it (and forget about all
306		other files for that page)
307		- Else if there is a single image file, take it (and forget about all other
308		files for that page)
309		- Otherwise raise an error (complaining that only PAGE-XML warrants
310		having multiple images for a single page)
311		Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>_
312
313		Returns:
314		A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
315		"""
316		if not self.input_file_grp:
317		raise ValueError("Processor is missing input fileGrp")
318		ret = self.zip_input_files(mimetype=None, on_error='abort')
319		if not ret:
320		return []
321		assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
322		return [tuples[0] for tuples in ret]
323
def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
    """
    List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

    Processors that expect/need multiple input file groups,
    cannot use :py:data:`input_files`. They must align (zip) input files
    across pages. This includes the case where not all pages
    are equally present in all file groups. It also requires
    making a consistent selection if there are multiple files
    per page.

    Following the OCR-D functional model, this function tries to
    find a single PAGE file per page, or fall back to a single
    image file per page. In either case, multiple matches per page
    are an error (see error handling below).
    This default behaviour can be changed by using a fixed MIME
    type filter via :py:attr:`mimetype`. But still, multiple matching
    files per page are an error.

    Single-page multiple-file errors are handled according to
    :py:attr:`on_error`:

    - if ``skip``, then the page for the respective fileGrp will be
      silently skipped (as if there was no match at all); any further
      file for that page in that fileGrp is ignored as well
    - if ``first``, then the first matching file for the page will be
      silently selected (as if the first was the only match)
    - if ``last``, then the last matching file for the page will be
      silently selected (as if the last was the only match)
    - if ``abort``, then an exception will be raised.

    Multiple matches for PAGE-XML will always raise an exception.

    Keyword Args:
        require_first (boolean): If true, then skip a page entirely
            whenever it is not available in the first input `fileGrp`.
        mimetype (string): If not `None`, filter by the specified MIME
            type (literal or regex prefixed by `//`). Otherwise prefer
            PAGE or image.
        on_error (string): How to settle multiple matching files for a
            single page, one of ``skip``, ``first``, ``last`` or
            ``abort`` (see above).
    Returns:
        A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
    Raises:
        ValueError: if :py:attr:`input_file_grp` is empty; if
            ``on_error`` is ``abort`` and either a page has conflicting
            matches or a non-empty ``page_id`` selection yields no files
            for a fileGrp; or if a page has multiple PAGE-XML matches
            while no MIME type filter is active.
        Exception: if a conflict has to be settled and ``on_error`` is
            not one of the known strategies.
    """
    if not self.input_file_grp:
        raise ValueError("Processor is missing input fileGrp")

    LOG = getLogger('ocrd.processor.base')
    ifgs = self.input_file_grp.split(",")

    def resolve_conflict(current, candidate, abort_message):
        # Single place for the on_error dispatch shared by the filtered
        # and unfiltered case: returns the file to keep (None = skip).
        if on_error == 'skip':
            return None
        if on_error == 'first':
            return current  # keep first match
        if on_error == 'last':
            return candidate
        if on_error == 'abort':
            raise ValueError(abort_message)
        raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

    # Iterating over all files repeatedly may seem inefficient at first sight,
    # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
    # can actually be much more costly than traversing the ltree.
    # This might depend on the number of pages vs number of fileGrps.

    pages = dict()
    # (pageId, fileGrp index) pairs dropped by the 'skip' strategy; without
    # this, a third match would find the slot empty again and be selected.
    skipped = set()
    for i, ifg in enumerate(ifgs):
        files_ = sorted(self.workspace.mets.find_all_files(
            pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                        # sort by MIME type so PAGE comes before images
                        key=lambda file_: file_.mimetype)
        # Warn if no files found but pageId was specified because that
        # might be because of invalid page_id (range)
        if self.page_id and not files_:
            msg = (f"Could not find any files for --page-id {self.page_id} - "
                   f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
            if on_error == 'abort':
                raise ValueError(msg)
            LOG.warning(msg)
        for file_ in files_:
            if not file_.pageId:
                continue
            ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
            if not ift[i] and (file_.pageId, i) not in skipped:
                LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                ift[i] = file_
                continue
            # fileGrp has multiple files for this page ID
            LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
            if (file_.pageId, i) in skipped:
                # page was already dropped for this fileGrp, stay consistent
                continue
            if mimetype:
                # filter was active, this must not happen
                ift[i] = resolve_conflict(
                    ift[i], file_,
                    "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                        mimetype, file_.pageId, ifg))
            elif ift[i].mimetype == MIMETYPE_PAGE:
                if file_.mimetype == MIMETYPE_PAGE:
                    raise ValueError(
                        "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                            file_.pageId, ifg))
                # keep PAGE match
                continue
            else:
                # filter was inactive but no PAGE is in control, this must not happen
                ift[i] = resolve_conflict(
                    ift[i], file_,
                    "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                        file_.pageId, ifg))
            if ift[i] is None:
                skipped.add((file_.pageId, i))
    ifts = list()
    for page, ifiles in pages.items():
        for ifg, ifile in zip(ifgs, ifiles):
            if not ifile:
                # other fallback options?
                LOG.error('found no page %s in file group %s',
                          page, ifg)
        if ifiles[0] or not require_first:
            ifts.append(tuple(ifiles))
    return ifts
444

OCR-D / core

Push — master ( 648be5...719bbc )

ocrd.processor.base F

Complexity

Size/Duplication

Importance

14 Methods

How to fix Duplicated Code Complexity

Duplicated Code

Complexity

Duplication Side-by-Side

Filter issues like