ocrd.processor.base - Code Metrics - Inspection of "Processor resource discovery" - OCR-D/core - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#559)

by Konstantin

created 2020-12-23 11:13 UTC

ocrd.processor.base B

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	316
Duplicated Lines	7.59 %

Importance

Changes

Metric	Value
wmc	52
eloc	194
dl	24
loc	316
rs	7.44
c	0
b	0
f	0

10 Methods

Rating	Name	Duplication	Size	Complexity
F	Processor.zip_input_files()	24	109	25
A	Processor.show_version()	0	2	1
A	Processor.process()	0	5	1
D	Processor.__init__()	0	59	13
A	Processor.list_all_resources()	0	5	1
A	Processor.verify()	0	5	1
B	Processor.resolve_resource()	0	27	5
A	Processor.add_metadata()	0	21	1
A	Processor.input_files()	0	21	3
A	Processor.show_help()	0	2	1

How to fix Duplicated Code Complexity

"""
Processor base class and helper functions
"""

# Public names of this module. Besides the Processor class, the helper
# functions imported from .helpers below are listed here so that they stay
# importable from this module.
__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]

from os import makedirs
from os.path import exists, isdir, join
from shutil import copyfileobj
import json
import os
import re
import sys
from pkg_resources import resource_filename

import requests

from ocrd_utils import (
    VERSION as OCRD_VERSION,
    MIMETYPE_PAGE,
    getLogger,
    initLogging,
    list_resource_candidates,
    list_all_resources,
    XDG_CACHE_HOME
)
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
from ..resource_manager import OcrdResourceManager

# XXX imports must remain for backwards-compatibility
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import

class Processor():
    """
    A processor is an OCR-D compliant command-line-interface for executing
    a single workflow step on the workspace (represented by local METS). It
    reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take 
    a number of optional or mandatory parameters.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if list_resources:
            for res in list_all_resources(ocrd_tool['executable']):
                print(res)
            return
        if show_resource:
            res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True)
            if not res_fname:
                initLogging()
                logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
                logger.error("Failed to resolve %s for processort %s" % (show_resource, ocrd_tool['executable']))
            else:
                with open(res_fname[0], 'rb') as f:
                    copyfileobj(f, sys.stdout.buffer)
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        """
        Print the help text generated for this processor from its
        ``ocrd_tool`` description.
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self)
        print(help_text)

    def show_version(self):
        """
        Print the version of this processor followed by the ocrd/core version.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)

    def verify(self):
        """
        Verify that the input fulfills the processor's requirements.
        """
        return True

    def process(self):
        """
        Process the workspace
        """
        raise Exception("Must be implemented")


    def add_metadata(self, pcgts):
        """
        Append a ``processingStep`` MetadataItem to the PAGE metadata of
        ``pcgts``.

        The item is named after the first entry of ``ocrd_tool['steps']`` and
        carries two label groups: one label per processor parameter, and the
        versions of this processor and of ocrd/core.
        """
        metadata = pcgts.get_Metadata()
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        # One label per parameter, in the parameter dict's own order
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=self.parameter[key])
                   for key in self.parameter.keys()])
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[
                LabelType(type_=self.ocrd_tool['executable'], value=self.version),
                LabelType(type_='ocrd/core', value=OCRD_VERSION),
            ])
        item = MetadataItemType(
            type_="processingStep",
            name=step_name,
            value=executable,
            Labels=[parameter_labels, version_labels])
        metadata.add_MetadataItem(item)

    def resolve_resource(self, val):
        """
        Resolve a resource name to an absolute file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        Args:
            val (string): resource value to resolve
        """
        executable = self.ocrd_tool['executable']
        if exists(val):
            return val
        ret = [cand for cand in list_resource_candidates(executable, val) if exists(cand)]
        if ret:
            return ret[0]
        resmgr = OcrdResourceManager()
        reslist = resmgr.find_resources(executable, name=val)
        if not reslist:
            reslist = resmgr.find_resources(executable, url=val)
        if not reslist:
            raise FileNotFoundError("Could not resolve '%s'" % val)
        _, resdict = reslist[0]
        return str(resmgr.download(
            executable,
            url=resdict['url'],
            name=resdict['name'],
            path_in_archive=resdict['path_in_archive'],
            resource_type=resdict['type']
        ))

    def list_all_resources(self):
        """
        Return all resources found in the filesystem for this processor's
        executable.
        """
        executable = self.ocrd_tool['executable']
        return list_all_resources(executable)

    @property
    def input_files(self):
        """
        List the input files (for single input file groups).

        For each physical page:
        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)
        (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        ret = self.zip_input_files(mimetype=None, on_error='abort')
        if not ret:
            return []
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [tuples[0] for tuples in ret]

    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multiple input file groups).

        Processors that expect/need multiple input file groups,
        cannot use ``input_files``. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via ``mimetype``. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        ``on_error``:
        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception.

        Args:
             require_first (bool): If true, then skip a page entirely
             whenever it is not available in the first input fileGrp.

             mimetype (str): If not None, filter by the specified MIME
             type (literal or regex prefixed by ``//``.
             Otherwise prefer PAGE or image.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = dict()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if ift[i]:
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    # fileGrp has multiple files for this page ID
                    if mimetype:
                        # filter was active, this must not happen
                        if on_error == 'skip':

                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                                    mimetype, file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype != MIMETYPE_PAGE):
                        pass # keep PAGE match
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype == MIMETYPE_PAGE):
                            raise ValueError(
                                "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                    file_.pageId, ifg))
                    else:
                        # filter was inactive but no PAGE is in control, this must not happen
                        if on_error == 'skip':

                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                                    file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                else:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts


1		"""
2		Processor base class and helper functions
3		"""
4
5		__all__ = [
6		'Processor',
7		'generate_processor_help',
8		'run_cli',
9		'run_processor'
10		]
11
12		from os import makedirs
13		from os.path import exists, isdir, join
14		from shutil import copyfileobj
15		import json
16		import os
17		import re
18		import sys
19		from pkg_resources import resource_filename
20
21		import requests
22
23		from ocrd_utils import (
24		VERSION as OCRD_VERSION,
25		MIMETYPE_PAGE,
26		getLogger,
27		initLogging,
28		list_resource_candidates,
29		list_all_resources,
30		XDG_CACHE_HOME
31		)
32		from ocrd_validators import ParameterValidator
33		from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
34		from ..resource_manager import OcrdResourceManager
35
36		# XXX imports must remain for backwards-compatibilty
37		from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
38
39		class Processor():
40		"""
41		A processor is an OCR-D compliant command-line-interface for executing
42		a single workflow step on the workspace (represented by local METS). It
43		reads input files for all or requested physical pages of the input fileGrp(s),
44		and writes output files for them into the output fileGrp(s). It may take
45		a number of optional or mandatory parameters.
46		"""
47
48		def __init__(
49		self,
50		workspace,
51		ocrd_tool=None,
52		parameter=None,
53		# TODO OCR-D/core#274
54		# input_file_grp=None,
55		# output_file_grp=None,
56		input_file_grp="INPUT",
57		output_file_grp="OUTPUT",
58		page_id=None,
59		show_resource=None,
60		list_resources=False,
61		show_help=False,
62		show_version=False,
63		dump_json=False,
64		version=None
65		):
66		if parameter is None:
67		parameter = {}
68		if dump_json:
69		print(json.dumps(ocrd_tool, indent=True))
70		return
71		if list_resources:
72		for res in list_all_resources(ocrd_tool['executable']):
73		print(res)
74		return
75		if show_resource:
76		res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True)
77		if not res_fname:
78		initLogging()
79		logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
80		logger.error("Failed to resolve %s for processort %s" % (show_resource, ocrd_tool['executable']))
81		else:
82		with open(res_fname[0], 'rb') as f:
83		copyfileobj(f, sys.stdout.buffer)
84		return
85		self.ocrd_tool = ocrd_tool
86		if show_help:
87		self.show_help()
88		return
89		self.version = version
90		if show_version:
91		self.show_version()
92		return
93		self.workspace = workspace
94		# FIXME HACK would be better to use pushd_popd(self.workspace.directory)
95		# but there is no way to do that in process here since it's an
96		# overridden method. chdir is almost always an anti-pattern.
97		if self.workspace:
98		os.chdir(self.workspace.directory)
99		self.input_file_grp = input_file_grp
100		self.output_file_grp = output_file_grp
101		self.page_id = None if page_id == [] or page_id is None else page_id
102		parameterValidator = ParameterValidator(ocrd_tool)
103		report = parameterValidator.validate(parameter)
104		if not report.is_valid:
105		raise Exception("Invalid parameters %s" % report.errors)
106		self.parameter = parameter
107
108		def show_help(self):
109		print(generate_processor_help(self.ocrd_tool, processor_instance=self))
110
111		def show_version(self):
112		print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))
113
114		def verify(self):
115		"""
116		Verify that the input fulfills the processor's requirements.
117		"""
118		return True
119
120		def process(self):
121		"""
122		Process the workspace
123		"""
124		raise Exception("Must be implemented")
125
126
127		def add_metadata(self, pcgts):
128		"""
129		Adds PAGE-XML MetadataItem describing the processing step
130		"""
131		pcgts.get_Metadata().add_MetadataItem(
132		MetadataItemType(type_="processingStep",
133		name=self.ocrd_tool['steps'][0],
134		value=self.ocrd_tool['executable'],
135		Labels=[LabelsType(
136		externalModel="ocrd-tool",
137		externalId="parameters",
138		Label=[LabelType(type_=name,
139		value=self.parameter[name])
140		for name in self.parameter.keys()]),
141		LabelsType(
142		externalModel="ocrd-tool",
143		externalId="version",
144		Label=[LabelType(type_=self.ocrd_tool['executable'],
145		value=self.version),
146		LabelType(type_='ocrd/core',
147		value=OCRD_VERSION)])
148		]))
149
150		def resolve_resource(self, val):
151		"""
152		Resolve a resource name to an absolute file path with the algorithm in
153		https://ocr-d.de/en/spec/ocrd_tool#file-parameters
154
155		Args:
156		val (string): resource value to resolve
157		"""
158		executable = self.ocrd_tool['executable']
159		if exists(val):
160		return val
161		ret = [cand for cand in list_resource_candidates(executable, val) if exists(cand)]
162		if ret:
163		return ret[0]
164		resmgr = OcrdResourceManager()
165		reslist = resmgr.find_resources(executable, name=val)
166		if not reslist:
167		reslist = resmgr.find_resources(executable, url=val)
168		if not reslist:
169		raise FileNotFoundError("Could not resolve '%s'" % val)
170		_, resdict = reslist[0]
171		return str(resmgr.download(
172		executable,
173		url=resdict['url'],
174		name=resdict['name'],
175		path_in_archive=resdict['path_in_archive'],
176		resource_type=resdict['type']
177		))
178
179		def list_all_resources(self):
180		"""
181		List all resources found in the filesystem
182		"""
183		return list_all_resources(self.ocrd_tool['executable'])
184
185		@property
186		def input_files(self):
187		"""
188		List the input files (for single input file groups).
189
190		For each physical page:
191		- If there is a single PAGE-XML for the page, take it (and forget about all
192		other files for that page)
193		- Else if there is a single image file, take it (and forget about all other
194		files for that page)
195		- Otherwise raise an error (complaining that only PAGE-XML warrants
196		having multiple images for a single page)
197		(https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
198		"""
199		if not self.input_file_grp:
200		raise ValueError("Processor is missing input fileGrp")
201		ret = self.zip_input_files(mimetype=None, on_error='abort')
202		if not ret:
203		return []
204		assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
205		return [tuples[0] for tuples in ret]
206
207		def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
208		"""
209		List tuples of input files (for multiple input file groups).
210
211		Processors that expect/need multiple input file groups,
212		cannot use ``input_files``. They must align (zip) input files
213		across pages. This includes the case where not all pages
214		are equally present in all file groups. It also requires
215		making a consistent selection if there are multiple files
216		per page.
217
218		Following the OCR-D functional model, this function tries to
219		find a single PAGE file per page, or fall back to a single
220		image file per page. In either case, multiple matches per page
221		are an error (see error handling below).
222		This default behaviour can be changed by using a fixed MIME
223		type filter via ``mimetype``. But still, multiple matching
224		files per page are an error.
225
226		Single-page multiple-file errors are handled according to
227		``on_error``:
228		- if ``skip``, then the page for the respective fileGrp will be
229		silently skipped (as if there was no match at all)
230		- if ``first``, then the first matching file for the page will be
231		silently selected (as if the first was the only match)
232		- if ``last``, then the last matching file for the page will be
233		silently selected (as if the last was the only match)
234		- if ``abort``, then an exception will be raised.
235		Multiple matches for PAGE-XML will always raise an exception.
236
237		Args:
238		require_first (bool): If true, then skip a page entirely
239		whenever it is not available in the first input fileGrp.
240
241		mimetype (str): If not None, filter by the specified MIME
242		type (literal or regex prefixed by ``//``.
243		Otherwise prefer PAGE or image.
244		"""
245		if not self.input_file_grp:
246		raise ValueError("Processor is missing input fileGrp")
247
248		LOG = getLogger('ocrd.processor.base')
249		ifgs = self.input_file_grp.split(",")
250		# Iterating over all files repeatedly may seem inefficient at first sight,
251		# but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
252		# can actually be much more costly than traversing the ltree.
253		# This might depend on the number of pages vs number of fileGrps.
254
255		pages = dict()
256		for i, ifg in enumerate(ifgs):
257		for file_ in sorted(self.workspace.mets.find_all_files(
258		pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
259		# sort by MIME type so PAGE comes before images
260		key=lambda file_: file_.mimetype):
261		if not file_.pageId:
262		continue
263		ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
264		if ift[i]:
265		LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
266		# fileGrp has multiple files for this page ID
267		if mimetype:
268		# filter was active, this must not happen
269	View Code Duplication	if on_error == 'skip':
		0 ignored issues – show Duplication introduced 2020-11-02 16:44 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
270		ift[i] = None
271		elif on_error == 'first':
272		pass # keep first match
273		elif on_error == 'last':
274		ift[i] = file_
275		elif on_error == 'abort':
276		raise ValueError(
277		"Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
278		mimetype, file_.pageId, ifg))
279		else:
280		raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
281		elif (ift[i].mimetype == MIMETYPE_PAGE and
282		file_.mimetype != MIMETYPE_PAGE):
283		pass # keep PAGE match
284		elif (ift[i].mimetype == MIMETYPE_PAGE and
285		file_.mimetype == MIMETYPE_PAGE):
286		raise ValueError(
287		"Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
288		file_.pageId, ifg))
289		else:
290		# filter was inactive but no PAGE is in control, this must not happen
291	View Code Duplication	if on_error == 'skip':
		0 ignored issues – show Duplication introduced 2020-11-02 16:44 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
292		ift[i] = None
293		elif on_error == 'first':
294		pass # keep first match
295		elif on_error == 'last':
296		ift[i] = file_
297		elif on_error == 'abort':
298		raise ValueError(
299		"No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
300		file_.pageId, ifg))
301		else:
302		raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
303		else:
304		LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
305		ift[i] = file_
306		ifts = list()
307		for page, ifiles in pages.items():
308		for i, ifg in enumerate(ifgs):
309		if not ifiles[i]:
310		# other fallback options?
311		LOG.error('found no page %s in file group %s',
312		page, ifg)
313		if ifiles[0] or not require_first:
314		ifts.append(tuple(ifiles))
315		return ifts
316

OCR-D / core

Pull Request — master (#559)

ocrd.processor.base B

Complexity

Size/Duplication

Importance

10 Methods

How to fix Duplicated Code Complexity

Duplicated Code

Complexity

Duplication Side-by-Side

Filter issues like