Passed
Pull Request — master (#559)
by Konstantin
02:54 created

ocrd.processor.base.Processor.show_help()   A

Complexity

Conditions 1

Size

Total Lines 2
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 2
rs 10
c 0
b 0
f 0
cc 1
nop 1
"""
Processor base class and helper functions
"""

__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]

from os import makedirs
from os.path import exists, isdir, join
from shutil import copyfileobj
import json
import os
import re
from pkg_resources import resource_filename

import requests

from ocrd_utils import (
    VERSION as OCRD_VERSION,
    MIMETYPE_PAGE,
    getLogger,
    list_resource_candidates,
    list_all_resources,
    XDG_CACHE_HOME
)
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType

# XXX imports must remain for backwards-compatibility
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import

class Processor():
    """
    A processor is an OCR-D compliant command-line interface for executing
    a single workflow step on the workspace (represented by local METS). It
    reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take
    a number of optional or mandatory parameters.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        print(generate_processor_help(self.ocrd_tool, processor_instance=self))

    def show_version(self):
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))

    def verify(self):
        """
        Verify that the input fulfills the processor's requirements.
        """
        return True

    def process(self):
        """
        Process the workspace
        """
        raise Exception("Must be implemented")


    def add_metadata(self, pcgts):
        """
        Adds a PAGE-XML MetadataItem describing the processing step
        """
        pcgts.get_Metadata().add_MetadataItem(
                MetadataItemType(type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=self.ocrd_tool['executable'],
                    Labels=[LabelsType(
                        externalModel="ocrd-tool",
                        externalId="parameters",
                        Label=[LabelType(type_=name,
                                         value=self.parameter[name])
                               for name in self.parameter.keys()]),
                            LabelsType(
                        externalModel="ocrd-tool",
                        externalId="version",
                        Label=[LabelType(type_=self.ocrd_tool['executable'],
                                         value=self.version),
                               LabelType(type_='ocrd/core',
                                         value=OCRD_VERSION)])
                    ]))
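        # The call above serializes roughly to the following PAGE-XML
        # (element and attribute names per the PAGE 2019 schema; the values
        # shown are only illustrative):
        #
        #   <MetadataItem type="processingStep" name="..." value="ocrd-...">
        #     <Labels externalModel="ocrd-tool" externalId="parameters">
        #       <Label type="param_name" value="param_value"/>
        #     </Labels>
        #     <Labels externalModel="ocrd-tool" externalId="version">
        #       <Label type="ocrd-..." value="..."/>
        #       <Label type="ocrd/core" value="..."/>
        #     </Labels>
        #   </MetadataItem>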

    def resolve_resource(self, parameter_name, val):
        """
        Resolve a resource name with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        Args:
            parameter_name (string): name of the parameter to resolve the resource for
            val (string): resource value to resolve
        """
        executable = self.ocrd_tool['executable']
        try:
            param = self.ocrd_tool['parameter'][parameter_name]
        except KeyError:
            raise ValueError("Parameter '%s' not defined in ocrd-tool.json" % parameter_name)
        if not param.get('mimetype'):
            raise ValueError("Parameter '%s' is not a file parameter (has no 'mimetype' field)" %
                             parameter_name)
        if val.startswith('http:') or val.startswith('https:'):
            cache_dir = join(XDG_CACHE_HOME, executable)
            cache_key = re.sub('[^A-Za-z0-9]', '', val)
            cache_fpath = join(cache_dir, cache_key)
            # TODO Proper caching (make a HEAD request for size, If-Modified-Since etc.)
            if not exists(cache_fpath):
                if not isdir(cache_dir):
                    makedirs(cache_dir)
                with requests.get(val, stream=True) as r:
                    with open(cache_fpath, 'wb') as f:
                        copyfileobj(r.raw, f)
            return cache_fpath
        # use a generator with a default (next() on a list would raise TypeError)
        # so we can still fall back to bundled resources below
        ret = next((cand for cand in list_resource_candidates(executable, val) if exists(cand)), None)
        if ret:
            return ret
        bundled_fpath = resource_filename(__name__, val)
        if exists(bundled_fpath):
            return bundled_fpath
        raise FileNotFoundError("Could not resolve '%s' file parameter value '%s'" %
                                (parameter_name, val))

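    # Example use of resolve_resource() above (hypothetical parameter name and
    # URL, for illustration only): for a tool whose ocrd-tool.json defines a
    # file parameter "model",
    #   self.resolve_resource('model', 'https://example.org/model.bin')
    # downloads the file once, caches it under XDG_CACHE_HOME/<executable>/ and
    # returns the cached path, whereas
    #   self.resolve_resource('model', 'model.bin')
    # searches the local resource candidate locations and finally the data
    # bundled with the package.
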
    def list_all_resources(self):
        """
        List all resources found in the filesystem
        """
        return list_all_resources(self.ocrd_tool['executable'])

    @property
    def input_files(self):
        """
        List the input files (for a single input file group).

        For each physical page:
        - if there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page),
        - else if there is a single image file, take it (and forget about all other
          files for that page),
        - otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page).
        (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        ret = self.zip_input_files(mimetype=None, on_error='abort')
        if not ret:
            return []
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [tuples[0] for tuples in ret]
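
        # Typical use in a concrete processor's process() (hypothetical sketch;
        # page_from_file is provided by ocrd_modelfactory):
        #
        #   for input_file in self.input_files:
        #       pcgts = page_from_file(self.workspace.download_file(input_file))
        #       ...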

    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multiple input file groups).

        Processors that expect/need multiple input file groups
        cannot use ``input_files``. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via ``mimetype``. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        ``on_error``:
        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all),
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match),
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match),
        - if ``abort``, then an exception will be raised.
        Multiple matches for PAGE-XML will always raise an exception.

        Args:
            require_first (bool): If true, then skip a page entirely
                whenever it is not available in the first input fileGrp.
            mimetype (str): If not None, filter by the specified MIME
                type (literal or regex prefixed by ``//``).
                Otherwise prefer PAGE or image.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs the number of fileGrps.

        pages = dict()
        for i, ifg in enumerate(ifgs):
            for file_ in sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype):
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if ift[i]:
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    # fileGrp has multiple files for this page ID
                    if mimetype:
                        # filter was active, this must not happen
                        if on_error == 'skip':
                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                                    mimetype, file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype != MIMETYPE_PAGE):
                        pass # keep PAGE match
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype == MIMETYPE_PAGE):
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    else:
                        # filter was inactive but no PAGE is in control, this must not happen
                        if on_error == 'skip':
                            ift[i] = None
                        elif on_error == 'first':
                            pass # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
                            raise ValueError(
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                                    file_.pageId, ifg))
                        else:
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                else:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
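
For orientation, here is a minimal sketch of how a concrete processor could build on this base class and on ``zip_input_files``. It is purely illustrative: the executable name, fileGrps and parameters are made up, and the helpers ``page_from_file``, ``make_file_id`` and ``to_xml`` are assumed to be available from ocrd_modelfactory, ocrd_utils and ocrd_models.ocrd_page respectively.

# --- Illustrative sketch, not part of the module ------------------------------
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import to_xml
from ocrd_utils import MIMETYPE_PAGE, make_file_id

# Hypothetical ocrd-tool.json entry for a processor reading two fileGrps
OCRD_TOOL = {
    'executable': 'ocrd-dummy-align',
    'steps': ['recognition/post-correction'],
    'parameters': {}
}

class DummyAligner(Processor):
    """Aligns a PAGE fileGrp with a second (e.g. image) fileGrp, page by page."""

    def __init__(self, *args, **kwargs):
        kwargs['ocrd_tool'] = OCRD_TOOL
        kwargs['version'] = '0.0.1'
        super().__init__(*args, **kwargs)

    def process(self):
        # called with e.g. -I OCR-D-SEG,OCR-D-IMG-BIN -O OCR-D-ALIGN,
        # so zip_input_files yields one 2-tuple per physical page
        for page_file, image_file in self.zip_input_files(on_error='skip'):
            if page_file is None or image_file is None:
                continue  # page missing (or skipped) in one of the fileGrps
            pcgts = page_from_file(self.workspace.download_file(page_file))
            self.add_metadata(pcgts)
            # ... actual per-page processing would go here ...
            file_id = make_file_id(page_file, self.output_file_grp)
            self.workspace.add_file(
                self.output_file_grp,
                ID=file_id,
                pageId=page_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s.xml' % (self.output_file_grp, file_id),
                content=to_xml(pcgts))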