Passed
Pull Request — master (#559)
by Konstantin
02:54 created

ocrd.processor.base.Processor.show_help()   A

Complexity

Conditions 1

Size

Total Lines 2
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 2
rs 10
c 0
b 0
f 0
cc 1
nop 1
"""
Processor base class and helper functions
"""

__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]

from os import makedirs
from os.path import exists, isdir, join
from shutil import copyfileobj
import json
import os
import re
from pkg_resources import resource_filename

import requests

from ocrd_utils import (
    VERSION as OCRD_VERSION,
    MIMETYPE_PAGE,
    getLogger,
    list_resource_candidates,
    list_all_resources,
    XDG_CACHE_HOME
)
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType

# XXX imports must remain for backwards-compatibility
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import

class Processor():
    """
    A processor is an OCR-D compliant command-line interface for executing
    a single workflow step on the workspace (represented by local METS). It
    reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take
    a number of optional or mandatory parameters.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        print(generate_processor_help(self.ocrd_tool, processor_instance=self))

    def show_version(self):
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))

    def verify(self):
        """
        Verify that the input fulfills the processor's requirements.
        """
        return True

    def process(self):
        """
        Process the workspace
        """
        raise Exception("Must be implemented")


    def add_metadata(self, pcgts):
        """
        Adds a PAGE-XML MetadataItem describing the processing step
        """
        pcgts.get_Metadata().add_MetadataItem(
                MetadataItemType(type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=self.ocrd_tool['executable'],
                    Labels=[LabelsType(
                        externalModel="ocrd-tool",
                        externalId="parameters",
                        Label=[LabelType(type_=name,
                                         value=self.parameter[name])
                               for name in self.parameter.keys()]),
                            LabelsType(
                        externalModel="ocrd-tool",
                        externalId="version",
                        Label=[LabelType(type_=self.ocrd_tool['executable'],
                                         value=self.version),
                               LabelType(type_='ocrd/core',
                                         value=OCRD_VERSION)])
                    ]))
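        # The call above serializes roughly to the following PAGE-XML
        # (element and attribute names per the PAGE 2019 schema; the values
        # shown are only illustrative):
        #
        #   <MetadataItem type="processingStep" name="..." value="ocrd-...">
        #     <Labels externalModel="ocrd-tool" externalId="parameters">
        #       <Label type="param_name" value="param_value"/>
        #     </Labels>
        #     <Labels externalModel="ocrd-tool" externalId="version">
        #       <Label type="ocrd-..." value="..."/>
        #       <Label type="ocrd/core" value="..."/>
        #     </Labels>
        #   </MetadataItem>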

    def resolve_resource(self, parameter_name, val):
        """
        Resolve a resource name with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        Args:
            parameter_name (string): name of the parameter to resolve the resource for
            val (string): resource value to resolve
        """
        executable = self.ocrd_tool['executable']
        try:
            param = self.ocrd_tool['parameter'][parameter_name]
        except KeyError:
            raise ValueError("Parameter '%s' not defined in ocrd-tool.json" % parameter_name)
        if not param.get('mimetype'):
            raise ValueError("Parameter '%s' is not a file parameter (has no 'mimetype' field)" %
                             parameter_name)
        if val.startswith('http:') or val.startswith('https:'):
            cache_dir = join(XDG_CACHE_HOME, executable)
            cache_key = re.sub('[^A-Za-z0-9]', '', val)
            cache_fpath = join(cache_dir, cache_key)
            # TODO Proper caching (make a HEAD request for size, If-Modified-Since etc.)
            if not exists(cache_fpath):
                if not isdir(cache_dir):
                    makedirs(cache_dir)
                with requests.get(val, stream=True) as r:
                    with open(cache_fpath, 'wb') as f:
                        copyfileobj(r.raw, f)
            return cache_fpath
        # use a generator with a default (next() on a list would raise TypeError)
        # so we can still fall back to bundled resources below
        ret = next((cand for cand in list_resource_candidates(executable, val) if exists(cand)), None)
        if ret:
            return ret
        bundled_fpath = resource_filename(__name__, val)
        if exists(bundled_fpath):
            return bundled_fpath
        raise FileNotFoundError("Could not resolve '%s' file parameter value '%s'" %
                                (parameter_name, val))

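    # Example use of resolve_resource() above (hypothetical parameter name and
    # URL, for illustration only): for a tool whose ocrd-tool.json defines a
    # file parameter "model",
    #   self.resolve_resource('model', 'https://example.org/model.bin')
    # downloads the file once, caches it under XDG_CACHE_HOME/<executable>/ and
    # returns the cached path, whereas
    #   self.resolve_resource('model', 'model.bin')
    # searches the local resource candidate locations and finally the data
    # bundled with the package.
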
    def list_all_resources(self):
        """
        List all resources found in the filesystem
        """
        return list_all_resources(self.ocrd_tool['executable'])

    @property
    def input_files(self):
        """
        List the input files (for a single input file group).

        For each physical page:
        - if there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page),
        - else if there is a single image file, take it (and forget about all other
          files for that page),
        - otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page).
        (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        ret = self.zip_input_files(mimetype=None, on_error='abort')
        if not ret:
            return []
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [tuples[0] for tuples in ret]
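
        # Typical use in a concrete processor's process() (hypothetical sketch;
        # page_from_file is provided by ocrd_modelfactory):
        #
        #   for input_file in self.input_files:
        #       pcgts = page_from_file(self.workspace.download_file(input_file))
        #       ...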

    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multiple input file groups).

        Processors that expect/need multiple input file groups
        cannot use ``input_files``. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via ``mimetype``. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        ``on_error``:
        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all),
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match),
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match),
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception.

        Args:
            require_first (bool): If true, then skip a page entirely
                whenever it is not available in the first input fileGrp.
            mimetype (str): If not None, filter by the specified MIME
                type (literal or regex prefixed by ``//``).
                Otherwise prefer PAGE or image.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs the number of fileGrps.

        pages = dict()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if ift[i]:
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    # fileGrp has multiple files for this page ID
                    if mimetype:
                        # filter was active, this must not happen
                        if on_error == 'skip':
                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                                    mimetype, file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype != MIMETYPE_PAGE):
                        pass # keep PAGE match
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype == MIMETYPE_PAGE):
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    else:
                        # filter was inactive but no PAGE is in control, this must not happen
                        if on_error == 'skip':
                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                                    file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                else:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
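
For orientation, here is a minimal sketch of how a concrete processor could build on this base class and on ``zip_input_files``. It is purely illustrative: the executable name, fileGrps and parameters are made up, and the helpers ``page_from_file``, ``make_file_id`` and ``to_xml`` are assumed to be available from ocrd_modelfactory, ocrd_utils and ocrd_models.ocrd_page respectively.

# --- Illustrative sketch, not part of the module ------------------------------
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import to_xml
from ocrd_utils import MIMETYPE_PAGE, make_file_id

# Hypothetical ocrd-tool.json entry for a processor reading two fileGrps
OCRD_TOOL = {
    'executable': 'ocrd-dummy-align',
    'steps': ['recognition/post-correction'],
    'parameters': {}
}

class DummyAligner(Processor):
    """Aligns a PAGE fileGrp with a second (e.g. image) fileGrp, page by page."""

    def __init__(self, *args, **kwargs):
        kwargs['ocrd_tool'] = OCRD_TOOL
        kwargs['version'] = '0.0.1'
        super().__init__(*args, **kwargs)

    def process(self):
        # called with e.g. -I OCR-D-SEG,OCR-D-IMG-BIN -O OCR-D-ALIGN,
        # so zip_input_files yields one 2-tuple per physical page
        for page_file, image_file in self.zip_input_files(on_error='skip'):
            if page_file is None or image_file is None:
                continue  # page missing (or skipped) in one of the fileGrps
            pcgts = page_from_file(self.workspace.download_file(page_file))
            self.add_metadata(pcgts)
            # ... actual per-page processing would go here ...
            file_id = make_file_id(page_file, self.output_file_grp)
            self.workspace.add_file(
                self.output_file_grp,
                ID=file_id,
                pageId=page_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s.xml' % (self.output_file_grp, file_id),
                content=to_xml(pcgts))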