Passed
Pull Request — master (#559)
by Konstantin
02:32
created

ocrd.processor.base.Processor.add_metadata()   A

Complexity

Conditions 1

Size

Total Lines 21
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 18
dl 0
loc 21
rs 9.5
c 0
b 0
f 0
cc 1
nop 2
1
"""
2
Processor base class and helper functions
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from os import makedirs
13
from os.path import exists, isdir, join
14
from shutil import copyfileobj
15
import json
16
import os
17
import re
18
import sys
19
20
import requests
21
22
from ocrd_utils import (
23
    VERSION as OCRD_VERSION,
24
    MIMETYPE_PAGE,
25
    getLogger,
26
    initLogging,
27
    list_resource_candidates,
28
    list_all_resources,
29
)
30
from ocrd_validators import ParameterValidator
31
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
32
from ..resource_manager import OcrdResourceManager
33
34
# XXX imports must remain for backwards-compatibility
35
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
36
37
class Processor():
38
    """
39
    A processor is an OCR-D compliant command-line-interface for executing
40
    a single workflow step on the workspace (represented by local METS). It
41
    reads input files for all or requested physical pages of the input fileGrp(s),
42
    and writes output files for them into the output fileGrp(s). It may take 
43
    a number of optional or mandatory parameters.
44
    """
45
46
    def __init__(
47
            self,
48
            workspace,
49
            ocrd_tool=None,
50
            parameter=None,
51
            # TODO OCR-D/core#274
52
            # input_file_grp=None,
53
            # output_file_grp=None,
54
            input_file_grp="INPUT",
55
            output_file_grp="OUTPUT",
56
            page_id=None,
57
            show_resource=None,
58
            list_resources=False,
59
            show_help=False,
60
            show_version=False,
61
            dump_json=False,
62
            version=None
63
    ):
64
        if parameter is None:
65
            parameter = {}
66
        if dump_json:
67
            print(json.dumps(ocrd_tool, indent=True))
68
            return
69
        if list_resources:
70
            for res in list_all_resources(ocrd_tool['executable']):
71
                print(res)
72
            return
73
        if show_resource:
74
            res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True)
75
            if not res_fname:
76
                initLogging()
77
                logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
78
                logger.error("Failed to resolve %s for processort %s" % (show_resource, ocrd_tool['executable']))
79
            else:
80
                with open(res_fname[0], 'rb') as f:
81
                    copyfileobj(f, sys.stdout.buffer)
82
            return
83
        self.ocrd_tool = ocrd_tool
84
        if show_help:
85
            self.show_help()
86
            return
87
        self.version = version
88
        if show_version:
89
            self.show_version()
90
            return
91
        self.workspace = workspace
92
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
93
        # but there is no way to do that in process here since it's an
94
        # overridden method. chdir is almost always an anti-pattern.
95
        if self.workspace:
96
            os.chdir(self.workspace.directory)
97
        self.input_file_grp = input_file_grp
98
        self.output_file_grp = output_file_grp
99
        self.page_id = None if page_id == [] or page_id is None else page_id
100
        parameterValidator = ParameterValidator(ocrd_tool)
101
        report = parameterValidator.validate(parameter)
102
        if not report.is_valid:
103
            raise Exception("Invalid parameters %s" % report.errors)
104
        self.parameter = parameter
105
106
    def show_help(self):
        """
        Print the help text generated for this processor's ``ocrd_tool``.
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self)
        print(help_text)
108
109
    def show_version(self):
        """
        Print the processor version together with the ocrd/core version.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
111
112
    def verify(self):
113
        """
114
        Verify that the input fulfills the processor's requirements.
115
        """
116
        return True
117
118
    def process(self):
119
        """
120
        Process the workspace
121
        """
122
        raise Exception("Must be implemented")
123
124
125
    def add_metadata(self, pcgts):
        """
        Append a ``processingStep`` MetadataItem to the metadata of ``pcgts``.

        The item is named after the first entry of ``ocrd_tool['steps']``,
        valued with the executable name, and carries two label groups: one
        with every parameter name/value pair, one with the processor version
        and the ocrd/core version.

        Args:
            pcgts: PAGE document object providing ``get_Metadata()``.
        """
        metadata = pcgts.get_Metadata()
        step_name = self.ocrd_tool['steps'][0]
        tool_name = self.ocrd_tool['executable']
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=val)
                   for key, val in self.parameter.items()])
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=self.ocrd_tool['executable'],
                             value=self.version),
                   LabelType(type_='ocrd/core',
                             value=OCRD_VERSION)])
        step_item = MetadataItemType(
            type_="processingStep",
            name=step_name,
            value=tool_name,
            Labels=[parameter_labels, version_labels])
        metadata.add_MetadataItem(step_item)
147
148
    def resolve_resource(self, val):
149
        """
150
        Resolve a resource name to an absolute file path with the algorithm in
151
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters
152
153
        Args:
154
            val (string): resource value to resolve
155
        """
156
        executable = self.ocrd_tool['executable']
157
        if exists(val):
158
            return val
159
        ret = [cand for cand in list_resource_candidates(executable, val) if exists(cand)]
160
        if ret:
161
            return ret[0]
162
        resmgr = OcrdResourceManager()
163
        reslist = resmgr.find_resources(executable, name=val)
164
        if not reslist:
165
            reslist = resmgr.find_resources(executable, url=val)
166
        if not reslist:
167
            raise FileNotFoundError("Could not resolve %s resource '%s'" % (executable, val))
168
        _, resdict = reslist[0]
169
        return str(resmgr.download(
170
            executable,
171
            url=resdict['url'],
172
            name=resdict['name'],
173
            path_in_archive=resdict['path_in_archive'],
174
            resource_type=resdict['type']
175
        ))
176
177
    def list_all_resources(self):
        """
        Return every resource found in the filesystem for this processor's executable.
        """
        executable = self.ocrd_tool['executable']
        return list_all_resources(executable)
182
183
    @property
184
    def input_files(self):
185
        """
186
        List the input files (for single input file groups).
187
188
        For each physical page:
189
        - If there is a single PAGE-XML for the page, take it (and forget about all
190
          other files for that page)
191
        - Else if there is a single image file, take it (and forget about all other
192
          files for that page)
193
        - Otherwise raise an error (complaining that only PAGE-XML warrants
194
          having multiple images for a single page)
195
        (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
196
        """
197
        if not self.input_file_grp:
198
            raise ValueError("Processor is missing input fileGrp")
199
        ret = self.zip_input_files(mimetype=None, on_error='abort')
200
        if not ret:
201
            return []
202
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
203
        return [tuples[0] for tuples in ret]
204
205
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multiple input file groups).

        Processors that expect/need multiple input file groups,
        cannot use ``input_files``. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via ``mimetype``. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        ``on_error``:
        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all); any further
          match for that page in that fileGrp is ignored as well
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception.

        Args:
             require_first (bool): If true, then skip a page entirely
             whenever it is not available in the first input fileGrp.

             mimetype (str): If not None, filter by the specified MIME
             type (literal or regex prefixed by ``//``).
             Otherwise prefer PAGE or image.

             on_error (str): Strategy for multiple matches per page and
             fileGrp: ``skip``, ``first``, ``last`` or ``abort``.

        Returns:
             list: one tuple per page, holding one entry per input fileGrp
             (the selected file, or ``None`` if nothing was selected).

        Raises:
             ValueError: if no input fileGrp is configured, if a page has
             multiple PAGE-XML matches in a fileGrp, or on multiple
             matches with ``on_error='abort'``.
             Exception: if a conflict occurs and ``on_error`` is not one
             of the known strategies.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")

        def resolve_conflict(kept, candidate, abort_message):
            # Apply the ``on_error`` strategy to two competing files and
            # return the one to keep (``None`` meaning: skip the page).
            if on_error == 'skip':
                return None
            if on_error == 'first':
                return kept
            if on_error == 'last':
                return candidate
            if on_error == 'abort':
                raise ValueError(abort_message)
            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = dict()
        # (pageId, fileGrp index) pairs dropped by on_error='skip'. Without
        # this, a third match would find the slot empty and be adopted.
        skipped = set()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if (file_.pageId, i) in skipped:
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    continue
                if not ift[i]:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                if mimetype:
                    # filter was active, this must not happen
                    ift[i] = resolve_conflict(
                        ift[i], file_,
                        "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                            mimetype, file_.pageId, ifg))
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    # otherwise keep the PAGE match
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    ift[i] = resolve_conflict(
                        ift[i], file_,
                        "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                            file_.pageId, ifg))
                if ift[i] is None:
                    skipped.add((file_.pageId, i))
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
314