ocrd.processor.base.Processor.input_files() - Code Metrics - Inspection of "Processor resource discovery" - OCR-D/core - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#559)

by Konstantin

created 2021-01-20 15:38 UTC

ocrd.processor.base.Processor.input_files() A

↳ Parent: ocrd.processor.base

Complexity

Conditions

Size

Total Lines	21
Code Lines	9

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	9
dl	0
loc	21
rs	9.95
c	0
b	0
f	0
cc	3
nop	1

"""
Processor base class and helper functions
"""

# Names exported by ``from ... import *``. Besides ``Processor`` (defined in
# this module), the three helper functions are imported from ``.helpers``
# further down and listed here so they stay importable from this module.
__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]

from os import makedirs
from os.path import exists, isdir, join
from shutil import copyfileobj
import json
import os
import re
import sys

import requests

from ocrd_utils import (
    VERSION as OCRD_VERSION,
    MIMETYPE_PAGE,
    getLogger,
    initLogging,
    list_resource_candidates,
    list_all_resources,
    XDG_CACHE_HOME
)
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
from ..resource_manager import OcrdResourceManager

# XXX imports must remain for backwards-compatibility
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import

class Processor():
    """
    A processor is an OCR-D compliant command-line-interface for executing
    a single workflow step on the workspace (represented by local METS). It
    reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take 
    a number of optional or mandatory parameters.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if list_resources:
            for res in list_all_resources(ocrd_tool['executable']):
                print(res)
            return
        if show_resource:
            res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True)
            if not res_fname:
                initLogging()
                logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
                logger.error("Failed to resolve %s for processort %s" % (show_resource, ocrd_tool['executable']))
            else:
                with open(res_fname[0], 'rb') as f:
                    copyfileobj(f, sys.stdout.buffer)
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        """
        Print the help text that ``generate_processor_help`` builds from
        ``self.ocrd_tool`` for this processor instance.
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self)
        print(help_text)

    def show_version(self):
        """
        Print one line with the processor version and the ocrd/core version.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)

    def verify(self):
        """
        Verify that the input fulfills the processor's requirements.
        """
        return True

    def process(self):
        """
        Process the workspace
        """
        raise Exception("Must be implemented")


    def add_metadata(self, pcgts):
        """
        Append a PAGE-XML MetadataItem describing this processing step.

        The item has type ``processingStep``, is named after the first entry
        of ``self.ocrd_tool['steps']`` and carries the executable name as its
        value. Two label groups are attached: one with a label per entry of
        ``self.parameter``, and one with the versions of the processor and of
        ocrd/core.

        Args:
            pcgts: PAGE document object; its ``get_Metadata()`` result
                receives the new item.
        """
        metadata = pcgts.get_Metadata()
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=value)
                   for key, value in self.parameter.items()])
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[
                LabelType(type_=executable, value=self.version),
                LabelType(type_='ocrd/core', value=OCRD_VERSION),
            ])
        metadata.add_MetadataItem(MetadataItemType(
            type_="processingStep",
            name=step_name,
            value=executable,
            Labels=[parameter_labels, version_labels]))

    def resolve_resource(self, val):
        """
        Resolve a resource name to an absolute file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        Args:
            val (string): resource value to resolve
        """
        executable = self.ocrd_tool['executable']
        if exists(val):
            return val
        ret = [cand for cand in list_resource_candidates(executable, val) if exists(cand)]
        if ret:
            return ret[0]
        resmgr = OcrdResourceManager()
        reslist = resmgr.find_resources(executable, name=val)
        if not reslist:
            reslist = resmgr.find_resources(executable, url=val)
        if not reslist:
            raise FileNotFoundError("Could not resolve %s resource '%s'" % (executable, val))
        _, resdict = reslist[0]
        return str(resmgr.download(
            executable,
            url=resdict['url'],
            name=resdict['name'],
            path_in_archive=resdict['path_in_archive'],
            resource_type=resdict['type']
        ))

    def list_all_resources(self):
        """
        Return the resources that the module-level ``list_all_resources``
        reports for this processor's executable.
        """
        executable = self.ocrd_tool['executable']
        return list_all_resources(executable)

    @property
    def input_files(self):
        """
        List the input files (for single input file groups).

        For each physical page:
        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)
        (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        ret = self.zip_input_files(mimetype=None, on_error='abort')
        if not ret:
            return []
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [tuples[0] for tuples in ret]

    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multiple input file groups).

        Processors that expect/need multiple input file groups,
        cannot use ``input_files``. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via ``mimetype``. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        ``on_error``:
        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception.

        Args:
             require_first (bool): If true, then skip a page entirely
             whenever it is not available in the first input fileGrp.

             mimetype (str): If not None, filter by the specified MIME
             type (literal or regex prefixed by ``//``.
             Otherwise prefer PAGE or image.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = dict()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if ift[i]:
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    # fileGrp has multiple files for this page ID
                    if mimetype:
                        # filter was active, this must not happen
                        if on_error == 'skip':

                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                                    mimetype, file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype != MIMETYPE_PAGE):
                        pass # keep PAGE match
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype == MIMETYPE_PAGE):
                            raise ValueError(
                                "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                    file_.pageId, ifg))
                    else:
                        # filter was inactive but no PAGE is in control, this must not happen
                        if on_error == 'skip':

                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                                    file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                else:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts


1		"""
2		Processor base class and helper functions
3		"""
4
5		__all__ = [
6		'Processor',
7		'generate_processor_help',
8		'run_cli',
9		'run_processor'
10		]
11
12		from os import makedirs
13		from os.path import exists, isdir, join
14		from shutil import copyfileobj
15		import json
16		import os
17		import re
18		import sys
19
20		import requests
21
22		from ocrd_utils import (
23		VERSION as OCRD_VERSION,
24		MIMETYPE_PAGE,
25		getLogger,
26		initLogging,
27		list_resource_candidates,
28		list_all_resources,
29		XDG_CACHE_HOME
30		)
31		from ocrd_validators import ParameterValidator
32		from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
33		from ..resource_manager import OcrdResourceManager
34
35		# XXX imports must remain for backwards-compatibilty
36		from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
37
38		class Processor():
39		"""
40		A processor is an OCR-D compliant command-line-interface for executing
41		a single workflow step on the workspace (represented by local METS). It
42		reads input files for all or requested physical pages of the input fileGrp(s),
43		and writes output files for them into the output fileGrp(s). It may take
44		a number of optional or mandatory parameters.
45		"""
46
47		def __init__(
48		self,
49		workspace,
50		ocrd_tool=None,
51		parameter=None,
52		# TODO OCR-D/core#274
53		# input_file_grp=None,
54		# output_file_grp=None,
55		input_file_grp="INPUT",
56		output_file_grp="OUTPUT",
57		page_id=None,
58		show_resource=None,
59		list_resources=False,
60		show_help=False,
61		show_version=False,
62		dump_json=False,
63		version=None
64		):
65		if parameter is None:
66		parameter = {}
67		if dump_json:
68		print(json.dumps(ocrd_tool, indent=True))
69		return
70		if list_resources:
71		for res in list_all_resources(ocrd_tool['executable']):
72		print(res)
73		return
74		if show_resource:
75		res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True)
76		if not res_fname:
77		initLogging()
78		logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
79		logger.error("Failed to resolve %s for processort %s" % (show_resource, ocrd_tool['executable']))
80		else:
81		with open(res_fname[0], 'rb') as f:
82		copyfileobj(f, sys.stdout.buffer)
83		return
84		self.ocrd_tool = ocrd_tool
85		if show_help:
86		self.show_help()
87		return
88		self.version = version
89		if show_version:
90		self.show_version()
91		return
92		self.workspace = workspace
93		# FIXME HACK would be better to use pushd_popd(self.workspace.directory)
94		# but there is no way to do that in process here since it's an
95		# overridden method. chdir is almost always an anti-pattern.
96		if self.workspace:
97		os.chdir(self.workspace.directory)
98		self.input_file_grp = input_file_grp
99		self.output_file_grp = output_file_grp
100		self.page_id = None if page_id == [] or page_id is None else page_id
101		parameterValidator = ParameterValidator(ocrd_tool)
102		report = parameterValidator.validate(parameter)
103		if not report.is_valid:
104		raise Exception("Invalid parameters %s" % report.errors)
105		self.parameter = parameter
106
107		def show_help(self):
108		print(generate_processor_help(self.ocrd_tool, processor_instance=self))
109
110		def show_version(self):
111		print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))
112
113		def verify(self):
114		"""
115		Verify that the input fulfills the processor's requirements.
116		"""
117		return True
118
119		def process(self):
120		"""
121		Process the workspace
122		"""
123		raise Exception("Must be implemented")
124
125
126		def add_metadata(self, pcgts):
127		"""
128		Adds PAGE-XML MetadataItem describing the processing step
129		"""
130		pcgts.get_Metadata().add_MetadataItem(
131		MetadataItemType(type_="processingStep",
132		name=self.ocrd_tool['steps'][0],
133		value=self.ocrd_tool['executable'],
134		Labels=[LabelsType(
135		externalModel="ocrd-tool",
136		externalId="parameters",
137		Label=[LabelType(type_=name,
138		value=self.parameter[name])
139		for name in self.parameter.keys()]),
140		LabelsType(
141		externalModel="ocrd-tool",
142		externalId="version",
143		Label=[LabelType(type_=self.ocrd_tool['executable'],
144		value=self.version),
145		LabelType(type_='ocrd/core',
146		value=OCRD_VERSION)])
147		]))
148
149		def resolve_resource(self, val):
150		"""
151		Resolve a resource name to an absolute file path with the algorithm in
152		https://ocr-d.de/en/spec/ocrd_tool#file-parameters
153
154		Args:
155		val (string): resource value to resolve
156		"""
157		executable = self.ocrd_tool['executable']
158		if exists(val):
159		return val
160		ret = [cand for cand in list_resource_candidates(executable, val) if exists(cand)]
161		if ret:
162		return ret[0]
163		resmgr = OcrdResourceManager()
164		reslist = resmgr.find_resources(executable, name=val)
165		if not reslist:
166		reslist = resmgr.find_resources(executable, url=val)
167		if not reslist:
168		raise FileNotFoundError("Could not resolve %s resource '%s'" % (executable, val))
169		_, resdict = reslist[0]
170		return str(resmgr.download(
171		executable,
172		url=resdict['url'],
173		name=resdict['name'],
174		path_in_archive=resdict['path_in_archive'],
175		resource_type=resdict['type']
176		))
177
178		def list_all_resources(self):
179		"""
180		List all resources found in the filesystem
181		"""
182		return list_all_resources(self.ocrd_tool['executable'])
183
184		@property
185		def input_files(self):
186		"""
187		List the input files (for single input file groups).
188
189		For each physical page:
190		- If there is a single PAGE-XML for the page, take it (and forget about all
191		other files for that page)
192		- Else if there is a single image file, take it (and forget about all other
193		files for that page)
194		- Otherwise raise an error (complaining that only PAGE-XML warrants
195		having multiple images for a single page)
196		(https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
197		"""
198		if not self.input_file_grp:
199		raise ValueError("Processor is missing input fileGrp")
200		ret = self.zip_input_files(mimetype=None, on_error='abort')
201		if not ret:
202		return []
203		assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
204		return [tuples[0] for tuples in ret]
205
206		def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
207		"""
208		List tuples of input files (for multiple input file groups).
209
210		Processors that expect/need multiple input file groups,
211		cannot use ``input_files``. They must align (zip) input files
212		across pages. This includes the case where not all pages
213		are equally present in all file groups. It also requires
214		making a consistent selection if there are multiple files
215		per page.
216
217		Following the OCR-D functional model, this function tries to
218		find a single PAGE file per page, or fall back to a single
219		image file per page. In either case, multiple matches per page
220		are an error (see error handling below).
221		This default behaviour can be changed by using a fixed MIME
222		type filter via ``mimetype``. But still, multiple matching
223		files per page are an error.
224
225		Single-page multiple-file errors are handled according to
226		``on_error``:
227		- if ``skip``, then the page for the respective fileGrp will be
228		silently skipped (as if there was no match at all)
229		- if ``first``, then the first matching file for the page will be
230		silently selected (as if the first was the only match)
231		- if ``last``, then the last matching file for the page will be
232		silently selected (as if the last was the only match)
233		- if ``abort``, then an exception will be raised.
234		Multiple matches for PAGE-XML will always raise an exception.
235
236		Args:
237		require_first (bool): If true, then skip a page entirely
238		whenever it is not available in the first input fileGrp.
239
240		mimetype (str): If not None, filter by the specified MIME
241		type (literal or regex prefixed by ``//``.
242		Otherwise prefer PAGE or image.
243		"""
244		if not self.input_file_grp:
245		raise ValueError("Processor is missing input fileGrp")
246
247		LOG = getLogger('ocrd.processor.base')
248		ifgs = self.input_file_grp.split(",")
249		# Iterating over all files repeatedly may seem inefficient at first sight,
250		# but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
251		# can actually be much more costly than traversing the ltree.
252		# This might depend on the number of pages vs number of fileGrps.
253
254		pages = dict()
255		for i, ifg in enumerate(ifgs):
256		for file_ in sorted(self.workspace.mets.find_all_files(
257		pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
258		# sort by MIME type so PAGE comes before images
259		key=lambda file_: file_.mimetype):
260		if not file_.pageId:
261		continue
262		ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
263		if ift[i]:
264		LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
265		# fileGrp has multiple files for this page ID
266		if mimetype:
267		# filter was active, this must not happen
268	View Code Duplication	if on_error == 'skip':
		0 ignored issues – show Duplication introduced 2020-11-02 16:44 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
269		ift[i] = None
270		elif on_error == 'first':
271		pass # keep first match
272		elif on_error == 'last':
273		ift[i] = file_
274		elif on_error == 'abort':
275		raise ValueError(
276		"Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
277		mimetype, file_.pageId, ifg))
278		else:
279		raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
280		elif (ift[i].mimetype == MIMETYPE_PAGE and
281		file_.mimetype != MIMETYPE_PAGE):
282		pass # keep PAGE match
283		elif (ift[i].mimetype == MIMETYPE_PAGE and
284		file_.mimetype == MIMETYPE_PAGE):
285		raise ValueError(
286		"Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
287		file_.pageId, ifg))
288		else:
289		# filter was inactive but no PAGE is in control, this must not happen
290	View Code Duplication	if on_error == 'skip':
		0 ignored issues – show Duplication introduced 2020-11-02 16:44 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
291		ift[i] = None
292		elif on_error == 'first':
293		pass # keep first match
294		elif on_error == 'last':
295		ift[i] = file_
296		elif on_error == 'abort':
297		raise ValueError(
298		"No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
299		file_.pageId, ifg))
300		else:
301		raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
302		else:
303		LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
304		ift[i] = file_
305		ifts = list()
306		for page, ifiles in pages.items():
307		for i, ifg in enumerate(ifgs):
308		if not ifiles[i]:
309		# other fallback options?
310		LOG.error('found no page %s in file group %s',
311		page, ifg)
312		if ifiles[0] or not require_first:
313		ifts.append(tuple(ifiles))
314		return ifts
315

OCR-D / core

Pull Request — master (#559)

ocrd.processor.base.Processor.input_files() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like