Passed
Pull Request — master (#663)
by Konstantin
01:53
created

ocrd.processor.base.Processor.input_files()   A

Complexity

Conditions 3

Size

Total Lines 21
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 9
dl 0
loc 21
rs 9.95
c 0
b 0
f 0
cc 3
nop 1
1
"""
2
Processor base class and helper functions
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from os import makedirs
13
from os.path import exists, isdir, join
14
from shutil import copyfileobj
15
import json
16
import os
17
from os import getcwd
18
import re
19
import sys
20
21
import requests
22
23
from ocrd_utils import (
24
    VERSION as OCRD_VERSION,
25
    MIMETYPE_PAGE,
26
    getLogger,
27
    initLogging,
28
    list_resource_candidates,
29
    list_all_resources,
30
)
31
from ocrd_validators import ParameterValidator
32
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
33
from ..resource_manager import OcrdResourceManager
34
35
# XXX imports must remain for backwards-compatibility
36
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
37
38
class Processor():
    """
    A processor is an OCR-D compliant command-line-interface for executing
    a single workflow step on the workspace (represented by local METS). It
    reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take
    a number of optional or mandatory parameters.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        """
        Instantiate the processor, or perform a one-shot informational action.

        The ``dump_json``, ``list_resources``, ``show_resource``, ``show_help``
        and ``show_version`` switches are checked in that order; the first one
        that is set prints its output and returns immediately, leaving the
        instance only partially initialized (attributes assigned after that
        point do not exist).

        Args:
            workspace: workspace to operate on; if truthy, the current working
                directory is changed to its ``directory`` attribute.
            ocrd_tool (dict): tool description; its ``executable`` key is read
                here, and ``steps`` is read by :py:meth:`add_metadata`.
            parameter (dict): parameters, validated against ``ocrd_tool``
                (``None`` is treated as an empty dict).
            input_file_grp (str): comma-separated input fileGrp name(s).
            output_file_grp (str): output fileGrp name(s).
            page_id: page selection passed to the METS file search; an empty
                list is normalized to ``None``.
            show_resource (str): name of a resource file to copy to stdout.
            list_resources (bool): print all resources for the executable.
            show_help (bool): print the generated help text.
            show_version (bool): print processor and core versions.
            dump_json (bool): print ``ocrd_tool`` as JSON.
            version (str): version of the processor implementation.

        Raises:
            Exception: if ``parameter`` does not validate against ``ocrd_tool``.
        """
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if list_resources:
            for res in list_all_resources(ocrd_tool['executable']):
                print(res)
            return
        if show_resource:
            res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True)
            if not res_fname:
                initLogging()
                logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
                logger.error("Failed to resolve %s for processor %s",
                             show_resource, ocrd_tool['executable'])
            else:
                # binary copy, so the resource is emitted unmodified
                with open(res_fname[0], 'rb') as f:
                    copyfileobj(f, sys.stdout.buffer)
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # Always remember the directory we were started from: resolve_resource()
        # reads it even when no workspace was passed.
        self.old_pwd = getcwd()
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        """
        Print the help text generated from the ocrd-tool description.
        """
        print(generate_processor_help(self.ocrd_tool, processor_instance=self))

    def show_version(self):
        """
        Print the processor version along with the ocrd/core version.
        """
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))

    def verify(self):
        """
        Verify that the input fulfills the processor's requirements.

        The base implementation accepts everything and returns ``True``.
        """
        return True

    def process(self):
        """
        Process the workspace

        Must be overridden by concrete processors.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError("Must be implemented")

    def add_metadata(self, pcgts):
        """
        Adds PAGE-XML MetadataItem describing the processing step

        The item records the first entry of ``ocrd_tool['steps']`` as name,
        the executable as value, one label per parameter, and the versions
        of the processor and of ocrd/core.

        Args:
            pcgts: PAGE document object whose ``get_Metadata()`` result
                accepts ``add_MetadataItem()``.
        """
        executable = self.ocrd_tool['executable']
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=name, value=value)
                   for name, value in self.parameter.items()])
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=executable, value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        pcgts.get_Metadata().add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             name=self.ocrd_tool['steps'][0],
                             value=executable,
                             Labels=[parameter_labels, version_labels]))

    def resolve_resource(self, val):
        """
        Resolve a resource name to an absolute file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        If ``val`` exists as given, it is returned unchanged. Otherwise the
        first existing resource candidate is returned. If nothing is found,
        an error is logged and the interpreter exits with status 1.

        Args:
            val (string): resource value to resolve
        """
        executable = self.ocrd_tool['executable']
        log = getLogger('ocrd.%s.resolve_resource' % executable)
        if exists(val):
            log.debug("Resolved to absolute path %s", val)
            return val
        # candidates are looked up relative to the directory we were started from
        for cand in list_resource_candidates(executable, val, cwd=self.old_pwd):
            if exists(cand):
                log.debug("Resolved %s to absolute path %s", val, cand)
                return cand
        log.error("Could not find resource '%s' for executable '%s'. Try 'ocrd resmgr download %s %s' to download this resource.",
                  val, executable, executable, val)
        sys.exit(1)

    def list_all_resources(self):
        """
        List all resources found in the filesystem
        """
        return list_all_resources(self.ocrd_tool['executable'])

    @property
    def input_files(self):
        """
        List the input files (for single input file groups).

        For each physical page:
        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)
        (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)

        Returns:
            list: one file per page (empty if nothing matched).

        Raises:
            ValueError: if no input fileGrp is set, or a page has ambiguous matches.
            AssertionError: if more than one input fileGrp is configured.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        ret = self.zip_input_files(mimetype=None, on_error='abort')
        if not ret:
            return []
        # Raised explicitly (rather than via ``assert``) so that the check
        # is still performed when Python runs with -O.
        if len(ret[0]) != 1:
            raise AssertionError(
                'Use zip_input_files() instead of input_files when processing multiple input fileGrps')
        return [tuples[0] for tuples in ret]

    @staticmethod
    def _resolve_duplicate(current, candidate, on_error, message):
        """
        Decide which file occupies a page slot that has two matches.

        Args:
            current: file already stored in the slot.
            candidate: additional file found for the same slot.
            on_error (str): one of ``skip``, ``first``, ``last``, ``abort``.
            message (str): error text used for the ``abort`` strategy.

        Returns:
            ``None`` for ``skip``, ``current`` for ``first``,
            ``candidate`` for ``last``.

        Raises:
            ValueError: for ``abort``.
            Exception: for an unknown strategy.
        """
        if on_error == 'skip':
            return None
        if on_error == 'first':
            return current
        if on_error == 'last':
            return candidate
        if on_error == 'abort':
            raise ValueError(message)
        raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multiple input file groups).

        Processors that expect/need multiple input file groups,
        cannot use ``input_files``. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via ``mimetype``. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        ``on_error``:
        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception
        when no ``mimetype`` filter is given.

        Args:
             require_first (bool): If true, then skip a page entirely
             whenever it is not available in the first input fileGrp.

             mimetype (str): If not None, filter by the specified MIME
             type (literal or regex prefixed by ``//``).
             Otherwise prefer PAGE or image.

             on_error (str): strategy for multiple matches (see above).

        Returns:
            list: one tuple per page, holding one entry per input fileGrp
            (``None`` where the fileGrp has no file for the page).

        Raises:
            ValueError: if no input fileGrp is set, on multiple PAGE-XML
                matches, or on any multiple match with ``on_error='abort'``.
            Exception: if ``on_error`` is not a known strategy (only
                detected once a multiple match is encountered).
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = dict()
        # (pageId, fileGrp index) slots blanked by on_error='skip'; remembered so
        # that a further duplicate is not mistaken for the first match of the page
        skipped = set()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # PAGE strictly first (other application/* types
                                # would otherwise sort before it), then by MIME type
                                key=lambda file_: (file_.mimetype != MIMETYPE_PAGE,
                                                   file_.mimetype)):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                slot = (file_.pageId, i)
                if slot not in skipped and not ift[i]:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                if slot in skipped:
                    continue
                if mimetype:
                    # filter was active, this must not happen
                    message = "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                        mimetype, file_.pageId, ifg)
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    continue # keep PAGE match
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    message = "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                        file_.pageId, ifg)
                ift[i] = self._resolve_duplicate(ift[i], file_, on_error, message)
                if on_error == 'skip':
                    skipped.add(slot)
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
308