ocrd.processor.base.Processor.add_metadata() - Code Metrics - Inspection of "Processor resource discovery" - OCR-D/core - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#559)

by Konstantin

created 2021-01-25 11:35 UTC

ocrd.processor.base.Processor.add_metadata() A

↳ Parent: ocrd.processor.base

Complexity

Conditions

Size

Total Lines	21
Code Lines	18

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	18
dl	0
loc	21
rs	9.5
c	0
b	0
f	0
cc	1
nop	2

"""
Processor base class and helper functions
"""

# Public names of this module. Besides the ``Processor`` class, the helper
# functions imported from ``.helpers`` further down are listed here so that
# they stay importable from this module.
__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]

from os import makedirs
from os.path import exists, isdir, join
from shutil import copyfileobj
import json
import os
import re
import sys

import requests

from ocrd_utils import (
    VERSION as OCRD_VERSION,
    MIMETYPE_PAGE,
    getLogger,
    initLogging,
    list_resource_candidates,
    list_all_resources,
)
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
from ..resource_manager import OcrdResourceManager

# XXX imports must remain for backwards-compatibility
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import

class Processor():
    """
    A processor is an OCR-D compliant command-line-interface for executing
    a single workflow step on the workspace (represented by local METS). It
    reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take 
    a number of optional or mandatory parameters.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if list_resources:
            for res in list_all_resources(ocrd_tool['executable']):
                print(res)
            return
        if show_resource:
            res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True)
            if not res_fname:
                initLogging()
                logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
                logger.error("Failed to resolve %s for processort %s" % (show_resource, ocrd_tool['executable']))
            else:
                with open(res_fname[0], 'rb') as f:
                    copyfileobj(f, sys.stdout.buffer)
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        """
        Print the help text generated from ``self.ocrd_tool`` to standard output.
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self)
        print(help_text)

    def show_version(self):
        """
        Print the processor version and the ocrd/core version on one line.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)

    def verify(self):
        """
        Verify that the input fulfills the processor's requirements.
        """
        return True

    def process(self):
        """
        Process the workspace
        """
        raise Exception("Must be implemented")


    def add_metadata(self, pcgts):
        """
        Append a ``processingStep`` MetadataItem for this processor to ``pcgts``.

        The item is named after the first entry of ``self.ocrd_tool['steps']``,
        carries the executable name as its value, and holds two label groups:
        one label per entry of ``self.parameter``, and the versions of the
        processor and of ocrd/core.

        Args:
            pcgts: object whose ``get_Metadata()`` result receives the item.
        """
        add_item = pcgts.get_Metadata().add_MetadataItem
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        # One label per parameter, keyed by the parameter name.
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=setting)
                   for key, setting in self.parameter.items()])
        # Versions of this processor and of the ocrd/core it ran with.
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=executable, value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        add_item(MetadataItemType(
            type_="processingStep",
            name=step_name,
            value=executable,
            Labels=[parameter_labels, version_labels]))

    def resolve_resource(self, val):
        """
        Resolve a resource name to an absolute file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        Args:
            val (string): resource value to resolve
        """
        executable = self.ocrd_tool['executable']
        if exists(val):
            return val
        ret = [cand for cand in list_resource_candidates(executable, val) if exists(cand)]
        if ret:
            return ret[0]
        resmgr = OcrdResourceManager()
        reslist = resmgr.find_resources(executable, name=val)
        if not reslist:
            reslist = resmgr.find_resources(executable, url=val)
        if not reslist:
            raise FileNotFoundError("Could not resolve %s resource '%s'" % (executable, val))
        _, resdict = reslist[0]
        return str(resmgr.download(
            executable,
            url=resdict['url'],
            name=resdict['name'],
            path_in_archive=resdict['path_in_archive'],
            resource_type=resdict['type']
        ))

    def list_all_resources(self):
        """
        Return the resources reported for this processor's executable.

        Delegates to the module-level ``list_all_resources`` function.
        """
        executable = self.ocrd_tool['executable']
        return list_all_resources(executable)

    @property
    def input_files(self):
        """
        List the input files (for single input file groups).

        For each physical page:
        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)
        (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        ret = self.zip_input_files(mimetype=None, on_error='abort')
        if not ret:
            return []
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [tuples[0] for tuples in ret]

    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multiple input file groups).

        Processors that expect/need multiple input file groups,
        cannot use ``input_files``. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via ``mimetype``. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        ``on_error``:
        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception.

        Args:
             require_first (bool): If true, then skip a page entirely
             whenever it is not available in the first input fileGrp.

             mimetype (str): If not None, filter by the specified MIME
             type (literal or regex prefixed by ``//``.
             Otherwise prefer PAGE or image.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = dict()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if ift[i]:
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    # fileGrp has multiple files for this page ID
                    if mimetype:
                        # filter was active, this must not happen
                        if on_error == 'skip':

                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                                    mimetype, file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype != MIMETYPE_PAGE):
                        pass # keep PAGE match
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype == MIMETYPE_PAGE):
                            raise ValueError(
                                "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                    file_.pageId, ifg))
                    else:
                        # filter was inactive but no PAGE is in control, this must not happen
                        if on_error == 'skip':

                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                                    file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                else:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts


1		"""
2		Processor base class and helper functions
3		"""
4
5		__all__ = [
6		'Processor',
7		'generate_processor_help',
8		'run_cli',
9		'run_processor'
10		]
11
12		from os import makedirs
13		from os.path import exists, isdir, join
14		from shutil import copyfileobj
15		import json
16		import os
17		import re
18		import sys
19
20		import requests
21
22		from ocrd_utils import (
23		VERSION as OCRD_VERSION,
24		MIMETYPE_PAGE,
25		getLogger,
26		initLogging,
27		list_resource_candidates,
28		list_all_resources,
29		)
30		from ocrd_validators import ParameterValidator
31		from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
32		from ..resource_manager import OcrdResourceManager
33
34		# XXX imports must remain for backwards-compatibilty
35		from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
36
37		class Processor():
38		"""
39		A processor is an OCR-D compliant command-line-interface for executing
40		a single workflow step on the workspace (represented by local METS). It
41		reads input files for all or requested physical pages of the input fileGrp(s),
42		and writes output files for them into the output fileGrp(s). It may take
43		a number of optional or mandatory parameters.
44		"""
45
46		def __init__(
47		self,
48		workspace,
49		ocrd_tool=None,
50		parameter=None,
51		# TODO OCR-D/core#274
52		# input_file_grp=None,
53		# output_file_grp=None,
54		input_file_grp="INPUT",
55		output_file_grp="OUTPUT",
56		page_id=None,
57		show_resource=None,
58		list_resources=False,
59		show_help=False,
60		show_version=False,
61		dump_json=False,
62		version=None
63		):
64		if parameter is None:
65		parameter = {}
66		if dump_json:
67		print(json.dumps(ocrd_tool, indent=True))
68		return
69		if list_resources:
70		for res in list_all_resources(ocrd_tool['executable']):
71		print(res)
72		return
73		if show_resource:
74		res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True)
75		if not res_fname:
76		initLogging()
77		logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
78		logger.error("Failed to resolve %s for processort %s" % (show_resource, ocrd_tool['executable']))
79		else:
80		with open(res_fname[0], 'rb') as f:
81		copyfileobj(f, sys.stdout.buffer)
82		return
83		self.ocrd_tool = ocrd_tool
84		if show_help:
85		self.show_help()
86		return
87		self.version = version
88		if show_version:
89		self.show_version()
90		return
91		self.workspace = workspace
92		# FIXME HACK would be better to use pushd_popd(self.workspace.directory)
93		# but there is no way to do that in process here since it's an
94		# overridden method. chdir is almost always an anti-pattern.
95		if self.workspace:
96		os.chdir(self.workspace.directory)
97		self.input_file_grp = input_file_grp
98		self.output_file_grp = output_file_grp
99		self.page_id = None if page_id == [] or page_id is None else page_id
100		parameterValidator = ParameterValidator(ocrd_tool)
101		report = parameterValidator.validate(parameter)
102		if not report.is_valid:
103		raise Exception("Invalid parameters %s" % report.errors)
104		self.parameter = parameter
105
106		def show_help(self):
107		print(generate_processor_help(self.ocrd_tool, processor_instance=self))
108
109		def show_version(self):
110		print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))
111
112		def verify(self):
113		"""
114		Verify that the input fulfills the processor's requirements.
115		"""
116		return True
117
118		def process(self):
119		"""
120		Process the workspace
121		"""
122		raise Exception("Must be implemented")
123
124
125		def add_metadata(self, pcgts):
126		"""
127		Adds PAGE-XML MetadataItem describing the processing step
128		"""
129		pcgts.get_Metadata().add_MetadataItem(
130		MetadataItemType(type_="processingStep",
131		name=self.ocrd_tool['steps'][0],
132		value=self.ocrd_tool['executable'],
133		Labels=[LabelsType(
134		externalModel="ocrd-tool",
135		externalId="parameters",
136		Label=[LabelType(type_=name,
137		value=self.parameter[name])
138		for name in self.parameter.keys()]),
139		LabelsType(
140		externalModel="ocrd-tool",
141		externalId="version",
142		Label=[LabelType(type_=self.ocrd_tool['executable'],
143		value=self.version),
144		LabelType(type_='ocrd/core',
145		value=OCRD_VERSION)])
146		]))
147
148		def resolve_resource(self, val):
149		"""
150		Resolve a resource name to an absolute file path with the algorithm in
151		https://ocr-d.de/en/spec/ocrd_tool#file-parameters
152
153		Args:
154		val (string): resource value to resolve
155		"""
156		executable = self.ocrd_tool['executable']
157		if exists(val):
158		return val
159		ret = [cand for cand in list_resource_candidates(executable, val) if exists(cand)]
160		if ret:
161		return ret[0]
162		resmgr = OcrdResourceManager()
163		reslist = resmgr.find_resources(executable, name=val)
164		if not reslist:
165		reslist = resmgr.find_resources(executable, url=val)
166		if not reslist:
167		raise FileNotFoundError("Could not resolve %s resource '%s'" % (executable, val))
168		_, resdict = reslist[0]
169		return str(resmgr.download(
170		executable,
171		url=resdict['url'],
172		name=resdict['name'],
173		path_in_archive=resdict['path_in_archive'],
174		resource_type=resdict['type']
175		))
176
177		def list_all_resources(self):
178		"""
179		List all resources found in the filesystem
180		"""
181		return list_all_resources(self.ocrd_tool['executable'])
182
183		@property
184		def input_files(self):
185		"""
186		List the input files (for single input file groups).
187
188		For each physical page:
189		- If there is a single PAGE-XML for the page, take it (and forget about all
190		other files for that page)
191		- Else if there is a single image file, take it (and forget about all other
192		files for that page)
193		- Otherwise raise an error (complaining that only PAGE-XML warrants
194		having multiple images for a single page)
195		(https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
196		"""
197		if not self.input_file_grp:
198		raise ValueError("Processor is missing input fileGrp")
199		ret = self.zip_input_files(mimetype=None, on_error='abort')
200		if not ret:
201		return []
202		assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
203		return [tuples[0] for tuples in ret]
204
205		def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
206		"""
207		List tuples of input files (for multiple input file groups).
208
209		Processors that expect/need multiple input file groups,
210		cannot use ``input_files``. They must align (zip) input files
211		across pages. This includes the case where not all pages
212		are equally present in all file groups. It also requires
213		making a consistent selection if there are multiple files
214		per page.
215
216		Following the OCR-D functional model, this function tries to
217		find a single PAGE file per page, or fall back to a single
218		image file per page. In either case, multiple matches per page
219		are an error (see error handling below).
220		This default behaviour can be changed by using a fixed MIME
221		type filter via ``mimetype``. But still, multiple matching
222		files per page are an error.
223
224		Single-page multiple-file errors are handled according to
225		``on_error``:
226		- if ``skip``, then the page for the respective fileGrp will be
227		silently skipped (as if there was no match at all)
228		- if ``first``, then the first matching file for the page will be
229		silently selected (as if the first was the only match)
230		- if ``last``, then the last matching file for the page will be
231		silently selected (as if the last was the only match)
232		- if ``abort``, then an exception will be raised.
233		Multiple matches for PAGE-XML will always raise an exception.
234
235		Args:
236		require_first (bool): If true, then skip a page entirely
237		whenever it is not available in the first input fileGrp.
238
239		mimetype (str): If not None, filter by the specified MIME
240		type (literal or regex prefixed by ``//``.
241		Otherwise prefer PAGE or image.
242		"""
243		if not self.input_file_grp:
244		raise ValueError("Processor is missing input fileGrp")
245
246		LOG = getLogger('ocrd.processor.base')
247		ifgs = self.input_file_grp.split(",")
248		# Iterating over all files repeatedly may seem inefficient at first sight,
249		# but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
250		# can actually be much more costly than traversing the ltree.
251		# This might depend on the number of pages vs number of fileGrps.
252
253		pages = dict()
254		for i, ifg in enumerate(ifgs):
255		for file_ in sorted(self.workspace.mets.find_all_files(
256		pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
257		# sort by MIME type so PAGE comes before images
258		key=lambda file_: file_.mimetype):
259		if not file_.pageId:
260		continue
261		ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
262		if ift[i]:
263		LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
264		# fileGrp has multiple files for this page ID
265		if mimetype:
266		# filter was active, this must not happen
267	View Code Duplication	if on_error == 'skip':
		0 ignored issues – show Duplication introduced 2020-11-02 16:44 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
268		ift[i] = None
269		elif on_error == 'first':
270		pass # keep first match
271		elif on_error == 'last':
272		ift[i] = file_
273		elif on_error == 'abort':
274		raise ValueError(
275		"Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
276		mimetype, file_.pageId, ifg))
277		else:
278		raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
279		elif (ift[i].mimetype == MIMETYPE_PAGE and
280		file_.mimetype != MIMETYPE_PAGE):
281		pass # keep PAGE match
282		elif (ift[i].mimetype == MIMETYPE_PAGE and
283		file_.mimetype == MIMETYPE_PAGE):
284		raise ValueError(
285		"Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
286		file_.pageId, ifg))
287		else:
288		# filter was inactive but no PAGE is in control, this must not happen
289	View Code Duplication	if on_error == 'skip':
		0 ignored issues – show Duplication introduced 2020-11-02 16:44 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
290		ift[i] = None
291		elif on_error == 'first':
292		pass # keep first match
293		elif on_error == 'last':
294		ift[i] = file_
295		elif on_error == 'abort':
296		raise ValueError(
297		"No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
298		file_.pageId, ifg))
299		else:
300		raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
301		else:
302		LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
303		ift[i] = file_
304		ifts = list()
305		for page, ifiles in pages.items():
306		for i, ifg in enumerate(ifgs):
307		if not ifiles[i]:
308		# other fallback options?
309		LOG.error('found no page %s in file group %s',
310		page, ifg)
311		if ifiles[0] or not require_first:
312		ifts.append(tuple(ifiles))
313		return ifts
314

OCR-D / core

Pull Request — master (#559)

ocrd.processor.base.Processor.add_metadata() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like