ocrd.processor.base.Processor.input_files() - Code Metrics - Inspection of "resmgr: No downloading on-demand" - OCR-D/core - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#663)

by Konstantin

created 2021-01-27 17:29 UTC

ocrd.processor.base.Processor.input_files() A

↳ Parent: ocrd.processor.base

Complexity

Conditions

Size

Total Lines	21
Code Lines	9

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	9
dl	0
loc	21
rs	9.95
c	0
b	0
f	0
cc	3
nop	1

"""
Processor base class and helper functions
"""

__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]

from os import makedirs
from os.path import exists, isdir, join
from shutil import copyfileobj
import json
import os
from os import getcwd
import re
import sys

import requests

from ocrd_utils import (
    VERSION as OCRD_VERSION,
    MIMETYPE_PAGE,
    getLogger,
    initLogging,
    list_resource_candidates,
    list_all_resources,
)
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
from ..resource_manager import OcrdResourceManager

# XXX imports must remain for backwards-compatibility
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import

class Processor():
    """
    A processor is an OCR-D compliant command-line-interface for executing
    a single workflow step on the workspace (represented by local METS). It
    reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take
    a number of optional or mandatory parameters.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        """
        Set up the processor, or serve one of the informational requests.

        The flags ``dump_json``, ``list_resources``, ``show_resource``,
        ``show_help`` and ``show_version`` are checked in that order. The
        first one that is set prints its output and returns immediately,
        leaving the instance only partially initialized (attributes
        assigned further down are not set in that case).

        Args:
            workspace: Workspace to operate on. If truthy, the current
                working directory is remembered in ``self.old_pwd`` and the
                process changes into ``workspace.directory``.
            ocrd_tool (dict): Tool description. Its ``executable`` entry is
                used for resource lookup, and the whole dict is handed to
                the parameter validator.
            parameter (dict): Parameters to validate and store. ``None``
                is replaced by an empty dict.
            input_file_grp (str): Input fileGrp(s), comma-separated.
            output_file_grp (str): Output fileGrp(s).
            page_id: Page selection. ``None`` and ``[]`` both mean "no
                restriction".
            show_resource (str): If set, resolve this resource and copy its
                content to stdout (or log an error if it cannot be resolved).
            list_resources (bool): If set, print every known resource.
            show_help (bool): If set, print the help text.
            show_version (bool): If set, print the version line.
            dump_json (bool): If set, print ``ocrd_tool`` as JSON.
            version (str): Version of the processor implementation.

        Raises:
            Exception: If ``parameter`` does not validate against ``ocrd_tool``.
        """
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if list_resources:
            for res in list_all_resources(ocrd_tool['executable']):
                print(res)
            return
        if show_resource:
            res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True)
            if not res_fname:
                # Logging has to be set up here because we bail out right after.
                initLogging()
                logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
                logger.error("Failed to resolve %s for processor %s",
                             show_resource, ocrd_tool['executable'])
            else:
                # Binary copy so that non-text resources survive unchanged.
                with open(res_fname[0], 'rb') as f:
                    copyfileobj(f, sys.stdout.buffer)
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        """
        Print the help text generated from the tool description.
        """
        print(generate_processor_help(self.ocrd_tool, processor_instance=self))

    def show_version(self):
        """
        Print the processor version together with the ocrd/core version.
        """
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))

    def verify(self):
        """
        Verify that the input fulfills the processor's requirements.

        The base implementation accepts everything and returns ``True``.
        """
        return True

    def process(self):
        """
        Process the workspace

        Raises:
            NotImplementedError: Always, in this base class. It is a
                subclass of ``Exception``, so existing handlers still match.
        """
        raise NotImplementedError("Must be implemented")


    def add_metadata(self, pcgts):
        """
        Adds PAGE-XML MetadataItem describing the processing step

        The item carries two label groups: one with every entry of
        ``self.parameter``, and one with the version of this processor and
        of ocrd/core.

        Args:
            pcgts: PAGE document object providing ``get_Metadata()``.
        """
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=name, value=self.parameter[name])
                   for name in self.parameter])
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=self.ocrd_tool['executable'],
                             value=self.version),
                   LabelType(type_='ocrd/core',
                             value=OCRD_VERSION)])
        pcgts.get_Metadata().add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             name=self.ocrd_tool['steps'][0],
                             value=self.ocrd_tool['executable'],
                             Labels=[parameter_labels, version_labels]))

    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        If ``val`` already exists as a path it is returned unchanged.
        Otherwise the first existing resource candidate is returned. If
        nothing is found, an error is logged and the process exits with
        status 1.

        Args:
            val (string): resource value to resolve
        """
        executable = self.ocrd_tool['executable']
        log = getLogger('ocrd.%s.resolve_resource' % executable)
        if exists(val):
            log.debug("Resolved to absolute path %s", val)
            return val
        # old_pwd is only recorded when the processor was created with a
        # workspace; without one no chdir happened, so the current directory
        # still is the directory the processor was started from.
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        ret = [cand for cand in list_resource_candidates(executable, val, cwd=cwd) if exists(cand)]
        if ret:
            log.debug("Resolved %s to absolute path %s", val, ret[0])
            return ret[0]
        log.error("Could not find resource '%s' for executable '%s'. Try 'ocrd resmgr download %s %s' to download this resource.",
                val, executable, executable, val)
        sys.exit(1)

    def list_all_resources(self):
        """
        List all resources found in the filesystem
        """
        return list_all_resources(self.ocrd_tool['executable'])

    @property
    def input_files(self):
        """
        List the input files (for single input file groups).

        For each physical page:
        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)
        (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)

        Raises:
            ValueError: If no input fileGrp is set, or a page has multiple
                matching files (see ``zip_input_files`` with ``abort``).
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        ret = self.zip_input_files(mimetype=None, on_error='abort')
        if not ret:
            return []
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [tuples[0] for tuples in ret]

    @staticmethod
    def _resolve_multiple_matches(first, other, on_error, abort_message):
        """
        Decide which of two files competing for the same page slot to keep.

        Args:
            first: File currently occupying the slot.
            other: Additional file found for the same page and fileGrp.
            on_error (str): One of ``skip``, ``first``, ``last``, ``abort``.
            abort_message (str): Message for the error raised on ``abort``.

        Returns:
            ``None`` for ``skip``, ``first`` for ``first``, ``other`` for ``last``.

        Raises:
            ValueError: For ``abort``.
            Exception: For any other (unknown) strategy.
        """
        if on_error == 'skip':
            return None
        if on_error == 'first':
            return first
        if on_error == 'last':
            return other
        if on_error == 'abort':
            raise ValueError(abort_message)
        raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multiple input file groups).

        Processors that expect/need multiple input file groups,
        cannot use ``input_files``. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via ``mimetype``. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        ``on_error``:
        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception.

        Args:
             require_first (bool): If true, then skip a page entirely
             whenever it is not available in the first input fileGrp.

             mimetype (str): If not None, filter by the specified MIME
             type (literal or regex prefixed by ``//``).
             Otherwise prefer PAGE or image.

             on_error (str): Strategy for multiple matches per page
             (``skip``, ``first``, ``last`` or ``abort``, see above).

        Returns:
             A list with one tuple per page, holding one entry per input
             fileGrp (``None`` where the fileGrp has no file for the page).

        Raises:
             ValueError: If no input fileGrp is set, on multiple PAGE-XML
             matches, or on multiple matches with ``on_error='abort'``.
             Exception: If multiple matches occur and ``on_error`` is unknown.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = {}
        # (pageId, fileGrp index) slots dropped by the ``skip`` strategy. They
        # must stay empty: without this record, a third file for the same page
        # would find the slot empty again and be picked as if it were unique.
        skipped = set()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if (file_.pageId, i) in skipped:
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    continue
                if not ift[i]:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                if mimetype:
                    # filter was active, this must not happen
                    ift[i] = self._resolve_multiple_matches(
                        ift[i], file_, on_error,
                        "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                            mimetype, file_.pageId, ifg))
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    # otherwise keep the PAGE match and ignore the other file
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    ift[i] = self._resolve_multiple_matches(
                        ift[i], file_, on_error,
                        "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                            file_.pageId, ifg))
                if ift[i] is None:
                    skipped.add((file_.pageId, i))
        ifts = []
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts


1		"""
2		Processor base class and helper functions
3		"""
4
5		__all__ = [
6		'Processor',
7		'generate_processor_help',
8		'run_cli',
9		'run_processor'
10		]
11
12		from os import makedirs
13		from os.path import exists, isdir, join
14		from shutil import copyfileobj
15		import json
16		import os
17		from os import getcwd
18		import re
19		import sys
20
21		import requests
22
23		from ocrd_utils import (
24		VERSION as OCRD_VERSION,
25		MIMETYPE_PAGE,
26		getLogger,
27		initLogging,
28		list_resource_candidates,
29		list_all_resources,
30		)
31		from ocrd_validators import ParameterValidator
32		from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
33		from ..resource_manager import OcrdResourceManager
34
35		# XXX imports must remain for backwards-compatibilty
36		from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
37
38		class Processor():
39		"""
40		A processor is an OCR-D compliant command-line-interface for executing
41		a single workflow step on the workspace (represented by local METS). It
42		reads input files for all or requested physical pages of the input fileGrp(s),
43		and writes output files for them into the output fileGrp(s). It may take
44		a number of optional or mandatory parameters.
45		"""
46
47		def __init__(
48		self,
49		workspace,
50		ocrd_tool=None,
51		parameter=None,
52		# TODO OCR-D/core#274
53		# input_file_grp=None,
54		# output_file_grp=None,
55		input_file_grp="INPUT",
56		output_file_grp="OUTPUT",
57		page_id=None,
58		show_resource=None,
59		list_resources=False,
60		show_help=False,
61		show_version=False,
62		dump_json=False,
63		version=None
64		):
65		if parameter is None:
66		parameter = {}
67		if dump_json:
68		print(json.dumps(ocrd_tool, indent=True))
69		return
70		if list_resources:
71		for res in list_all_resources(ocrd_tool['executable']):
72		print(res)
73		return
74		if show_resource:
75		res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True)
76		if not res_fname:
77		initLogging()
78		logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
79		logger.error("Failed to resolve %s for processort %s" % (show_resource, ocrd_tool['executable']))
80		else:
81		with open(res_fname[0], 'rb') as f:
82		copyfileobj(f, sys.stdout.buffer)
83		return
84		self.ocrd_tool = ocrd_tool
85		if show_help:
86		self.show_help()
87		return
88		self.version = version
89		if show_version:
90		self.show_version()
91		return
92		self.workspace = workspace
93		# FIXME HACK would be better to use pushd_popd(self.workspace.directory)
94		# but there is no way to do that in process here since it's an
95		# overridden method. chdir is almost always an anti-pattern.
96		if self.workspace:
97		self.old_pwd = getcwd()
98		os.chdir(self.workspace.directory)
99		self.input_file_grp = input_file_grp
100		self.output_file_grp = output_file_grp
101		self.page_id = None if page_id == [] or page_id is None else page_id
102		parameterValidator = ParameterValidator(ocrd_tool)
103		report = parameterValidator.validate(parameter)
104		if not report.is_valid:
105		raise Exception("Invalid parameters %s" % report.errors)
106		self.parameter = parameter
107
108		def show_help(self):
109		print(generate_processor_help(self.ocrd_tool, processor_instance=self))
110
111		def show_version(self):
112		print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))
113
114		def verify(self):
115		"""
116		Verify that the input fulfills the processor's requirements.
117		"""
118		return True
119
120		def process(self):
121		"""
122		Process the workspace
123		"""
124		raise Exception("Must be implemented")
125
126
127		def add_metadata(self, pcgts):
128		"""
129		Adds PAGE-XML MetadataItem describing the processing step
130		"""
131		pcgts.get_Metadata().add_MetadataItem(
132		MetadataItemType(type_="processingStep",
133		name=self.ocrd_tool['steps'][0],
134		value=self.ocrd_tool['executable'],
135		Labels=[LabelsType(
136		externalModel="ocrd-tool",
137		externalId="parameters",
138		Label=[LabelType(type_=name,
139		value=self.parameter[name])
140		for name in self.parameter.keys()]),
141		LabelsType(
142		externalModel="ocrd-tool",
143		externalId="version",
144		Label=[LabelType(type_=self.ocrd_tool['executable'],
145		value=self.version),
146		LabelType(type_='ocrd/core',
147		value=OCRD_VERSION)])
148		]))
149
150		def resolve_resource(self, val):
151		"""
152		Resolve a resource name to an absolute file path with the algorithm in
153		https://ocr-d.de/en/spec/ocrd_tool#file-parameters
154
155		Args:
156		val (string): resource value to resolve
157		"""
158		executable = self.ocrd_tool['executable']
159		log = getLogger('ocrd.%s.resolve_resource' % executable)
160		if exists(val):
161		log.debug("Resolved to absolute path %s" % val)
162		return val
163		ret = [cand for cand in list_resource_candidates(executable, val, cwd=self.old_pwd) if exists(cand)]
164		if ret:
165		log.debug("Resolved %s to absolute path %s" % (val, ret[0]))
166		return ret[0]
167		log.error("Could not find resource '%s' for executable '%s'. Try 'ocrd resmgr download %s %s' to download this resource.",
168		val, executable, executable, val)
169		sys.exit(1)
170
171		def list_all_resources(self):
172		"""
173		List all resources found in the filesystem
174		"""
175		return list_all_resources(self.ocrd_tool['executable'])
176
177		@property
178		def input_files(self):
179		"""
180		List the input files (for single input file groups).
181
182		For each physical page:
183		- If there is a single PAGE-XML for the page, take it (and forget about all
184		other files for that page)
185		- Else if there is a single image file, take it (and forget about all other
186		files for that page)
187		- Otherwise raise an error (complaining that only PAGE-XML warrants
188		having multiple images for a single page)
189		(https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
190		"""
191		if not self.input_file_grp:
192		raise ValueError("Processor is missing input fileGrp")
193		ret = self.zip_input_files(mimetype=None, on_error='abort')
194		if not ret:
195		return []
196		assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
197		return [tuples[0] for tuples in ret]
198
199		def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
200		"""
201		List tuples of input files (for multiple input file groups).
202
203		Processors that expect/need multiple input file groups,
204		cannot use ``input_files``. They must align (zip) input files
205		across pages. This includes the case where not all pages
206		are equally present in all file groups. It also requires
207		making a consistent selection if there are multiple files
208		per page.
209
210		Following the OCR-D functional model, this function tries to
211		find a single PAGE file per page, or fall back to a single
212		image file per page. In either case, multiple matches per page
213		are an error (see error handling below).
214		This default behaviour can be changed by using a fixed MIME
215		type filter via ``mimetype``. But still, multiple matching
216		files per page are an error.
217
218		Single-page multiple-file errors are handled according to
219		``on_error``:
220		- if ``skip``, then the page for the respective fileGrp will be
221		silently skipped (as if there was no match at all)
222		- if ``first``, then the first matching file for the page will be
223		silently selected (as if the first was the only match)
224		- if ``last``, then the last matching file for the page will be
225		silently selected (as if the last was the only match)
226		- if ``abort``, then an exception will be raised.
227		Multiple matches for PAGE-XML will always raise an exception.
228
229		Args:
230		require_first (bool): If true, then skip a page entirely
231		whenever it is not available in the first input fileGrp.
232
233		mimetype (str): If not None, filter by the specified MIME
234		type (literal or regex prefixed by ``//``.
235		Otherwise prefer PAGE or image.
236		"""
237		if not self.input_file_grp:
238		raise ValueError("Processor is missing input fileGrp")
239
240		LOG = getLogger('ocrd.processor.base')
241		ifgs = self.input_file_grp.split(",")
242		# Iterating over all files repeatedly may seem inefficient at first sight,
243		# but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
244		# can actually be much more costly than traversing the ltree.
245		# This might depend on the number of pages vs number of fileGrps.
246
247		pages = dict()
248		for i, ifg in enumerate(ifgs):
249		for file_ in sorted(self.workspace.mets.find_all_files(
250		pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
251		# sort by MIME type so PAGE comes before images
252		key=lambda file_: file_.mimetype):
253		if not file_.pageId:
254		continue
255		ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
256		if ift[i]:
257		LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
258		# fileGrp has multiple files for this page ID
259		if mimetype:
260		# filter was active, this must not happen
261	View Code Duplication	if on_error == 'skip':
		0 ignored issues – show Duplication introduced 2020-11-02 16:44 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
262		ift[i] = None
263		elif on_error == 'first':
264		pass # keep first match
265		elif on_error == 'last':
266		ift[i] = file_
267		elif on_error == 'abort':
268		raise ValueError(
269		"Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
270		mimetype, file_.pageId, ifg))
271		else:
272		raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
273		elif (ift[i].mimetype == MIMETYPE_PAGE and
274		file_.mimetype != MIMETYPE_PAGE):
275		pass # keep PAGE match
276		elif (ift[i].mimetype == MIMETYPE_PAGE and
277		file_.mimetype == MIMETYPE_PAGE):
278		raise ValueError(
279		"Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
280		file_.pageId, ifg))
281		else:
282		# filter was inactive but no PAGE is in control, this must not happen
283	View Code Duplication	if on_error == 'skip':
		0 ignored issues – show Duplication introduced 2020-11-02 16:44 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
284		ift[i] = None
285		elif on_error == 'first':
286		pass # keep first match
287		elif on_error == 'last':
288		ift[i] = file_
289		elif on_error == 'abort':
290		raise ValueError(
291		"No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
292		file_.pageId, ifg))
293		else:
294		raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
295		else:
296		LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
297		ift[i] = file_
298		ifts = list()
299		for page, ifiles in pages.items():
300		for i, ifg in enumerate(ifgs):
301		if not ifiles[i]:
302		# other fallback options?
303		LOG.error('found no page %s in file group %s',
304		page, ifg)
305		if ifiles[0] or not require_first:
306		ifts.append(tuple(ifiles))
307		return ifts
308

OCR-D / core

Pull Request — master (#663)

ocrd.processor.base.Processor.input_files() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like