Passed
Pull Request — master (#559)
by Konstantin
02:18
created

ocrd.processor.base.Processor.input_files()   A

Complexity

Conditions 3

Size

Total Lines 21
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 9
dl 0
loc 21
rs 9.95
c 0
b 0
f 0
cc 3
nop 1
1
"""
2
Processor base class and helper functions
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from os import makedirs
13
from os.path import exists, isdir, join
14
from shutil import copyfileobj
15
import json
16
import os
17
import re
18
import sys
19
20
import requests
21
22
from ocrd_utils import (
23
    VERSION as OCRD_VERSION,
24
    MIMETYPE_PAGE,
25
    getLogger,
26
    initLogging,
27
    list_resource_candidates,
28
    list_all_resources,
29
    XDG_CACHE_HOME
30
)
31
from ocrd_validators import ParameterValidator
32
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
33
from ..resource_manager import OcrdResourceManager
34
35
# XXX imports must remain for backwards-compatibility
36
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
37
38
class Processor():
39
    """
40
    A processor is an OCR-D compliant command-line-interface for executing
41
    a single workflow step on the workspace (represented by local METS). It
42
    reads input files for all or requested physical pages of the input fileGrp(s),
43
    and writes output files for them into the output fileGrp(s). It may take 
44
    a number of optional or mandatory parameters.
45
    """
46
47
    def __init__(
48
            self,
49
            workspace,
50
            ocrd_tool=None,
51
            parameter=None,
52
            # TODO OCR-D/core#274
53
            # input_file_grp=None,
54
            # output_file_grp=None,
55
            input_file_grp="INPUT",
56
            output_file_grp="OUTPUT",
57
            page_id=None,
58
            show_resource=None,
59
            list_resources=False,
60
            show_help=False,
61
            show_version=False,
62
            dump_json=False,
63
            version=None
64
    ):
65
        if parameter is None:
66
            parameter = {}
67
        if dump_json:
68
            print(json.dumps(ocrd_tool, indent=True))
69
            return
70
        if list_resources:
71
            for res in list_all_resources(ocrd_tool['executable']):
72
                print(res)
73
            return
74
        if show_resource:
75
            res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True)
76
            if not res_fname:
77
                initLogging()
78
                logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
79
                logger.error("Failed to resolve %s for processort %s" % (show_resource, ocrd_tool['executable']))
80
            else:
81
                with open(res_fname[0], 'rb') as f:
82
                    copyfileobj(f, sys.stdout.buffer)
83
            return
84
        self.ocrd_tool = ocrd_tool
85
        if show_help:
86
            self.show_help()
87
            return
88
        self.version = version
89
        if show_version:
90
            self.show_version()
91
            return
92
        self.workspace = workspace
93
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
94
        # but there is no way to do that in process here since it's an
95
        # overridden method. chdir is almost always an anti-pattern.
96
        if self.workspace:
97
            os.chdir(self.workspace.directory)
98
        self.input_file_grp = input_file_grp
99
        self.output_file_grp = output_file_grp
100
        self.page_id = None if page_id == [] or page_id is None else page_id
101
        parameterValidator = ParameterValidator(ocrd_tool)
102
        report = parameterValidator.validate(parameter)
103
        if not report.is_valid:
104
            raise Exception("Invalid parameters %s" % report.errors)
105
        self.parameter = parameter
106
107
    def show_help(self):
        """
        Print the text returned by ``generate_processor_help`` for this processor.
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self)
        print(help_text)
109
110
    def show_version(self):
        """
        Print this processor's version together with the ocrd/core version.
        """
        banner = "Version %s, ocrd/core %s" % (self.version, OCRD_VERSION)
        print(banner)
112
113
    def verify(self):
114
        """
115
        Verify that the input fulfills the processor's requirements.
116
        """
117
        return True
118
119
    def process(self):
120
        """
121
        Process the workspace
122
        """
123
        raise Exception("Must be implemented")
124
125
126
    def add_metadata(self, pcgts):
        """
        Record this processing step as a PAGE-XML MetadataItem on ``pcgts``.

        The item is named after the first entry of the ocrd-tool ``steps``,
        carries the executable name as its value, and gets two label groups:
        one with a label per entry of ``self.parameter``, one with the
        processor version and the ocrd/core version.

        Args:
            pcgts: object whose ``get_Metadata()`` result receives the item.
        """
        metadata = pcgts.get_Metadata()
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        # one label per parameter actually in effect
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=val)
                   for key, val in self.parameter.items()])
        # versions of the processor itself and of ocrd/core
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=executable, value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        item = MetadataItemType(
            type_="processingStep",
            name=step_name,
            value=executable,
            Labels=[parameter_labels, version_labels])
        metadata.add_MetadataItem(item)
148
149
    def resolve_resource(self, val):
150
        """
151
        Resolve a resource name to an absolute file path with the algorithm in
152
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters
153
154
        Args:
155
            val (string): resource value to resolve
156
        """
157
        executable = self.ocrd_tool['executable']
158
        if exists(val):
159
            return val
160
        ret = [cand for cand in list_resource_candidates(executable, val) if exists(cand)]
161
        if ret:
162
            return ret[0]
163
        resmgr = OcrdResourceManager()
164
        reslist = resmgr.find_resources(executable, name=val)
165
        if not reslist:
166
            reslist = resmgr.find_resources(executable, url=val)
167
        if not reslist:
168
            raise FileNotFoundError("Could not resolve %s resource '%s'" % (executable, val))
169
        _, resdict = reslist[0]
170
        return str(resmgr.download(
171
            executable,
172
            url=resdict['url'],
173
            name=resdict['name'],
174
            path_in_archive=resdict['path_in_archive'],
175
            resource_type=resdict['type']
176
        ))
177
178
    def list_all_resources(self):
        """
        List all resources found for this processor's executable.

        Delegates to the module-level ``list_all_resources`` function.
        """
        executable = self.ocrd_tool['executable']
        return list_all_resources(executable)
183
184
    @property
185
    def input_files(self):
186
        """
187
        List the input files (for single input file groups).
188
189
        For each physical page:
190
        - If there is a single PAGE-XML for the page, take it (and forget about all
191
          other files for that page)
192
        - Else if there is a single image file, take it (and forget about all other
193
          files for that page)
194
        - Otherwise raise an error (complaining that only PAGE-XML warrants
195
          having multiple images for a single page)
196
        (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
197
        """
198
        if not self.input_file_grp:
199
            raise ValueError("Processor is missing input fileGrp")
200
        ret = self.zip_input_files(mimetype=None, on_error='abort')
201
        if not ret:
202
            return []
203
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
204
        return [tuples[0] for tuples in ret]
205
206
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multiple input file groups).

        Processors that expect/need multiple input file groups,
        cannot use ``input_files``. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via ``mimetype``. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        ``on_error``:
        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception.

        Args:
             require_first (bool): If true, then skip a page entirely
             whenever it is not available in the first input fileGrp.

             mimetype (str): If not None, filter by the specified MIME
             type (literal or regex prefixed by ``//``).
             Otherwise prefer PAGE or image.

             on_error (str): One of ``skip``, ``first``, ``last`` or
             ``abort`` (see above).

        Returns:
             list: one tuple per page, holding one entry per input
             fileGrp (in the order of ``input_file_grp``); an entry is
             the selected file, or None if nothing was selected.

        Raises:
             ValueError: if no input fileGrp is set, if a page has
             multiple PAGE-XML files in one fileGrp, or on multiple
             matches with ``on_error='abort'``.
             Exception: on an unknown ``on_error`` value (only detected
             once a page actually has multiple matches).
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")

        def resolve_conflict(current, candidate, abort_template, *abort_args):
            """Return the file to keep for a page with several matches, per ``on_error``."""
            if on_error == 'skip':
                return None
            if on_error == 'first':
                return current
            if on_error == 'last':
                return candidate
            if on_error == 'abort':
                raise ValueError(abort_template % abort_args)
            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = dict()
        # (pageId, fileGrp index) slots dropped by on_error='skip'. Without
        # this, a third match would find the slot empty and be added as if it
        # were the first, so the page would not be skipped after all.
        skipped = set()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if (file_.pageId, i) in skipped:
                    LOG.debug("ignoring file %s for skipped page %s in input file group %s",
                              file_.ID, file_.pageId, ifg)
                    continue
                if not ift[i]:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
                    continue
                LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                # fileGrp has multiple files for this page ID
                if mimetype:
                    # filter was active, this must not happen
                    ift[i] = resolve_conflict(
                        ift[i], file_,
                        "Multiple '%s' matches for page '%s' in fileGrp '%s'.",
                        mimetype, file_.pageId, ifg)
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    # otherwise keep the PAGE match and ignore the other file
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    ift[i] = resolve_conflict(
                        ift[i], file_,
                        "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches.",
                        file_.pageId, ifg)
                if ift[i] is None:
                    skipped.add((file_.pageId, i))
        ifts = list()
        for page, ifiles in pages.items():
            for ifg, ifile in zip(ifgs, ifiles):
                if not ifile:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
315