Passed
Pull Request — master (#559) by Konstantin, created 01:57

ocrd.processor.base — rating D

Complexity
    Total Complexity    58

Size/Duplication
    Total Lines         324
    Duplicated Lines    7.41 %

Importance
    Changes             0

Metric   Value
wmc      58
eloc     201
dl       24
loc      324
rs       4.5599
c        0
b        0
f        0

10 Methods

Rating   Name                             Duplication   Size   Complexity
F        Processor.zip_input_files()      24            109    25
A        Processor.show_version()         0             2      1
A        Processor.process()              0             5      1
D        Processor.__init__()             0             59     13
A        Processor.list_all_resources()   0             5      1
A        Processor.verify()               0             5      1
C        Processor.resolve_resource()     0             37     11
A        Processor.add_metadata()         0             21     1
A        Processor.input_files()          0             21     3
A        Processor.show_help()            0             2      1

How to fix

Duplicated Code

Duplicate code is one of the most pungent code smells. A commonly used rule of thumb is to restructure code once it is duplicated in three or more places.


Complexity

 Tip:   Before tackling complexity, make sure to eliminate any duplication first. This can often reduce the size of classes significantly.

Complex classes like ocrd.processor.base often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
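The Extract Class refactoring described above can be sketched generically. The names here (``ResourceManager``, ``SlimProcessor``) are purely illustrative, not the OCR-D API; the sketch only shows how methods sharing a "resource" affix could move into their own cohesive class:

```python
# Sketch of Extract Class: the resource-related behaviour is moved out of
# a god class into its own component. All names are illustrative.

class ResourceManager:
    """Cohesive component extracted from a larger Processor-like class."""
    def __init__(self, executable, candidates):
        self.executable = executable
        self._candidates = candidates  # e.g. resource paths on disk

    def list_all(self):
        # analogous to list_all_resources()
        return list(self._candidates)

    def resolve(self, name):
        # analogous to resolve_resource(): first existing candidate wins
        for cand in self._candidates:
            if cand.endswith(name):
                return cand
        raise FileNotFoundError(name)


class SlimProcessor:
    """The remaining class now delegates instead of doing everything itself."""
    def __init__(self, executable, candidates):
        self.resources = ResourceManager(executable, candidates)
```

After such an extraction, ``resolve_resource()`` and ``list_all_resources()`` would become thin delegations, shrinking the wmc of the original class.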

"""
Processor base class and helper functions
"""

__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]

from os import makedirs
from os.path import exists, isdir, join
from shutil import copyfileobj
import json
import os
import re
import sys
from pkg_resources import resource_filename

import requests

from ocrd_utils import (
    VERSION as OCRD_VERSION,
    MIMETYPE_PAGE,
    getLogger,
    initLogging,
    list_resource_candidates,
    list_all_resources,
    XDG_CACHE_HOME
)
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType

# XXX imports must remain for backwards-compatibility
from .helpers import run_cli, run_processor, generate_processor_help  # pylint: disable=unused-import

class Processor():
    """
    A processor is an OCR-D compliant command-line interface for executing
    a single workflow step on the workspace (represented by local METS). It
    reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take
    a number of optional or mandatory parameters.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if list_resources:
            for res in list_all_resources(ocrd_tool['executable']):
                print(res)
            return
        if show_resource:
            res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True)
            if not res_fname:
                initLogging()
                logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
                logger.error("Failed to resolve %s for processor %s" % (show_resource, ocrd_tool['executable']))
            else:
                with open(res_fname[0], 'rb') as f:
                    copyfileobj(f, sys.stdout.buffer)
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        print(generate_processor_help(self.ocrd_tool, processor_instance=self))

    def show_version(self):
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))

    def verify(self):
        """
        Verify that the input fulfills the processor's requirements.
        """
        return True

    def process(self):
        """
        Process the workspace
        """
        raise Exception("Must be implemented")

    def add_metadata(self, pcgts):
        """
        Adds PAGE-XML MetadataItem describing the processing step
        """
        pcgts.get_Metadata().add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=self.ocrd_tool['executable'],
                Labels=[LabelsType(
                            externalModel="ocrd-tool",
                            externalId="parameters",
                            Label=[LabelType(type_=name,
                                             value=self.parameter[name])
                                   for name in self.parameter.keys()]),
                        LabelsType(
                            externalModel="ocrd-tool",
                            externalId="version",
                            Label=[LabelType(type_=self.ocrd_tool['executable'],
                                             value=self.version),
                                   LabelType(type_='ocrd/core',
                                             value=OCRD_VERSION)])
                ]))

    def resolve_resource(self, parameter_name, val):
        """
        Resolve a resource name to an absolute file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        Args:
            parameter_name (string): name of the parameter to resolve the resource for
            val (string): resource value to resolve
        """
        executable = self.ocrd_tool['executable']
        try:
            param = self.ocrd_tool['parameter'][parameter_name]
        except KeyError:
            raise ValueError("Parameter '%s' not defined in ocrd-tool.json" % parameter_name)
        if not param.get('mimetype'):
            raise ValueError("Parameter '%s' is not a file parameter (has no 'mimetype' field)" %
                             parameter_name)
        if val.startswith('http:') or val.startswith('https:'):
            cache_dir = join(XDG_CACHE_HOME, executable)
            cache_key = re.sub('[^A-Za-z0-9]', '', val)
            cache_fpath = join(cache_dir, cache_key)
            # TODO Proper caching (make head request for size, If-Modified etc)
            if not exists(cache_fpath):
                if not isdir(cache_dir):
                    makedirs(cache_dir)
                with requests.get(val, stream=True) as r:
                    with open(cache_fpath, 'wb') as f:
                        copyfileobj(r.raw, f)
            return cache_fpath
        # next() needs a generator (not a list) and a default, otherwise this
        # raises TypeError resp. StopIteration when no candidate exists
        ret = next((cand for cand in list_resource_candidates(executable, val) if exists(cand)), None)
        if ret:
            return ret
        bundled_fpath = resource_filename(__name__, val)
        if exists(bundled_fpath):
            return bundled_fpath
        raise FileNotFoundError("Could not resolve '%s' file parameter value '%s'" %
                                (parameter_name, val))

    def list_all_resources(self):
        """
        List all resources found in the filesystem
        """
        return list_all_resources(self.ocrd_tool['executable'])

    @property
    def input_files(self):
        """
        List the input files (for single input file groups).

        For each physical page:
        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)
        (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        ret = self.zip_input_files(mimetype=None, on_error='abort')
        if not ret:
            return []
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [tuples[0] for tuples in ret]

    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multiple input file groups).

        Processors that expect/need multiple input file groups
        cannot use ``input_files``. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via ``mimetype``. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        ``on_error``:
        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception.

        Args:
             require_first (bool): If true, then skip a page entirely
             whenever it is not available in the first input fileGrp.

             mimetype (str): If not None, filter by the specified MIME
             type (literal or regex prefixed by ``//``).
             Otherwise prefer PAGE or image.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = dict()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if ift[i]:
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    # fileGrp has multiple files for this page ID
                    if mimetype:
                        # filter was active, this must not happen
                        if on_error == 'skip':
                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                                    mimetype, file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype != MIMETYPE_PAGE):
                        pass # keep PAGE match
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype == MIMETYPE_PAGE):
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    else:
                        # filter was inactive but no PAGE is in control, this must not happen
                        if on_error == 'skip':
                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                                    file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                else:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
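For reference, the page-alignment core of ``zip_input_files()`` can be sketched standalone, using plain ``(pageId, fileGrp, name)`` tuples instead of ``OcrdFile`` objects and leaving out the MIME-type preference and ``on_error`` handling. ``zip_by_page`` is a hypothetical name, not part of ocrd:

```python
# Standalone sketch of the alignment algorithm in zip_input_files():
# one file per page per fileGrp, rows keyed by page, optionally requiring
# a match in the first fileGrp. First match wins in this simplification.

def zip_by_page(files, filegrps, require_first=True):
    """Align one file per page across the given fileGrps.

    files: iterable of (page_id, filegrp, name) tuples
    """
    pages = {}
    for page_id, filegrp, name in files:
        row = pages.setdefault(page_id, [None] * len(filegrps))
        i = filegrps.index(filegrp)
        if row[i] is None:      # keep the first match per page/fileGrp
            row[i] = name
    result = []
    for page_id, row in pages.items():
        if row[0] is not None or not require_first:
            result.append(tuple(row))
    return result
```

With ``require_first=True`` a page missing from the first fileGrp is dropped entirely, mirroring the ``if ifiles[0] or not require_first`` condition at the end of the real method.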