Passed
Pull Request — master (#799)
by Konstantin
02:07
created

ocrd.processor.base.Processor.resolve_resource()   B

Complexity

Conditions 6

Size

Total Lines 38
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 28
dl 0
loc 38
rs 8.2746
c 0
b 0
f 0
cc 6
nop 2
1
"""
2
Processor base class and helper functions.
3
"""
4
5
# Public API of this module. The helper functions are re-exported here
# although they are implemented in the sibling ``helpers`` module.
__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor',
]
11
12
from os.path import exists
13
from shutil import copyfileobj
14
import json
15
import os
16
from os import getcwd
17
from pathlib import Path
18
import sys
19
import tarfile
20
import io
21
22
from ocrd_utils import (
23
    VERSION as OCRD_VERSION,
24
    MIMETYPE_PAGE,
25
    getLogger,
26
    initLogging,
27
    list_resource_candidates,
28
    nth_url_segment,
29
    pushd_popd,
30
    list_all_resources,
31
    get_processor_resource_types
32
)
33
from ocrd_validators import ParameterValidator
34
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
35
from ocrd.resource_manager import OcrdResourceManager
36
37
# XXX imports must remain for backwards-compatibility
38
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
39
40
class Processor():
41
    """
42
    A processor is a tool that implements the uniform OCR-D command-line interface
43
    for run-time data processing. That is, it executes a single workflow step,
44
    or a combination of workflow steps, on the workspace (represented by local METS).
45
    It reads input files for all or requested physical pages of the input fileGrp(s),
46
    and writes output files for them into the output fileGrp(s). It may take 
47
    a number of optional or mandatory parameters.
48
    """
49
50
    def __init__(
51
            self,
52
            workspace,
53
            ocrd_tool=None,
54
            parameter=None,
55
            # TODO OCR-D/core#274
56
            # input_file_grp=None,
57
            # output_file_grp=None,
58
            input_file_grp="INPUT",
59
            output_file_grp="OUTPUT",
60
            page_id=None,
61
            show_resource=None,
62
            list_resources=False,
63
            show_help=False,
64
            show_version=False,
65
            dump_json=False,
66
            version=None
67
    ):
68
        """
69
        Instantiate, but do not process. Unless ``list_resources`` or
70
        ``show_resource`` or ``show_help`` or ``show_version`` or
71
        ``dump_json`` is true, setup for processing (parsing and
72
        validating parameters, entering the workspace directory).
73
74
        Args:
75
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
76
                 Can be ``None`` even for processing (esp. on multiple workspaces), \
77
                 but then needs to be set before running.
78
        Keyword Args:
79
             ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
80
                 Can be ``None`` for processing, but needs to be set before running.
81
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
82
                 Can be ``None`` even for processing, but then needs to be set before running.
83
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
84
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
85
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
86
                 (or empty for all pages).
87
             show_resource (string): If not ``None``, then instead of processing, resolve \
88
                 given resource by name and print its contents to stdout.
89
             list_resources (boolean): If true, then instead of processing, find all installed \
90
                 resource files in the search paths and print their path names.
91
             show_help (boolean): If true, then instead of processing, print a usage description \
92
                 including the standard CLI and all of this processor's ocrd-tool parameters and \
93
                 docstrings.
94
             show_version (boolean): If true, then instead of processing, print information on \
95
                 this processor's version and OCR-D version. Exit afterwards.
96
             dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
97
                 on stdout.
98
        """
99
        if parameter is None:
100
            parameter = {}
101
        if dump_json:
102
            print(json.dumps(ocrd_tool, indent=True))
103
            return
104
        if list_resources:
105
            has_dirs, has_files = get_processor_resource_types(None, ocrd_tool)
106
            for res in list_all_resources(ocrd_tool['executable']):
107
                if Path(res).is_dir() and not has_dirs:
108
                    continue
109
                if not Path(res).is_dir() and not has_files:
110
                    continue
111
                print(res)
112
            return
113
        if show_resource:
114
            has_dirs, has_files = get_processor_resource_types(None, ocrd_tool)
115
            res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource)
116
            if not res_fname:
117
                initLogging()
118
                logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
119
                logger.error("Failed to resolve %s for processor %s" % (show_resource, ocrd_tool['executable']))
120
            else:
121
                fpath = Path(res_fname[0])
122
                if fpath.is_dir():
123
                    with pushd_popd(fpath):
124
                        fileobj = io.BytesIO()
125
                        with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball:
126
                            tarball.add('.')
127
                        fileobj.seek(0)
128
                        copyfileobj(fileobj, sys.stdout.buffer)
129
                else:
130
                    sys.stdout.buffer.write(fpath.read_bytes())
131
            return
132
        self.ocrd_tool = ocrd_tool
133
        if show_help:
134
            self.show_help()
135
            return
136
        self.version = version
137
        if show_version:
138
            self.show_version()
139
            return
140
        self.workspace = workspace
141
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
142
        # but there is no way to do that in process here since it's an
143
        # overridden method. chdir is almost always an anti-pattern.
144
        if self.workspace:
145
            self.old_pwd = getcwd()
146
            os.chdir(self.workspace.directory)
147
        self.input_file_grp = input_file_grp
148
        self.output_file_grp = output_file_grp
149
        self.page_id = None if page_id == [] or page_id is None else page_id
150
        parameterValidator = ParameterValidator(ocrd_tool)
151
        report = parameterValidator.validate(parameter)
152
        if not report.is_valid:
153
            raise Exception("Invalid parameters %s" % report.errors)
154
        self.parameter = parameter
155
156
    def show_help(self):
        """
        Print the help text generated from :py:attr:`ocrd_tool` for this
        processor instance to stdout.
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self)
        print(help_text)
158
159
    def show_version(self):
        """
        Print the version of this processor and of OCR-D core to stdout.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
161
162
    def verify(self):
        """
        Check that the :py:attr:`input_file_grp` meets the requirements of
        this processor.

        This base implementation performs no check and always succeeds.

        Returns:
            ``True``
        """
        requirements_met = True
        return requirements_met
167
168
    def process(self):
        """
        Process the :py:attr:`workspace`
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id`
        under the given :py:attr:`parameter`.

        (This contains the main functionality and needs to be overridden by subclasses.)

        Raises:
            NotImplementedError: always, in this base class. It is a subclass
                of :py:class:`Exception`, so callers catching ``Exception``
                are unaffected.
        """
        raise NotImplementedError("Must be implemented")
179
180
181
    def add_metadata(self, pcgts):
        """
        Append a ``MetadataItem`` of type ``processingStep`` to the metadata of ``pcgts``.

        The item is named after the first entry of ``ocrd_tool['steps']`` and
        carries ``ocrd_tool['executable']`` as its value. It holds two label
        groups: one with a label per entry of :py:attr:`parameter`, and one
        with the versions of this processor and of OCR-D core.

        Args:
            pcgts (:py:class:`~ocrd_models.ocrd_page.PcGtsType`): the PAGE-XML
                document to annotate
        """
        # look up the target first, exactly as a single chained call would
        add_item = pcgts.get_Metadata().add_MetadataItem
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']

        parameter_labels = []
        for name in self.parameter.keys():
            parameter_labels.append(LabelType(type_=name,
                                              value=self.parameter[name]))
        parameters_group = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=parameter_labels)

        version_labels = [
            LabelType(type_=executable, value=self.version),
            LabelType(type_='ocrd/core', value=OCRD_VERSION),
        ]
        versions_group = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=version_labels)

        add_item(MetadataItemType(
            type_="processingStep",
            name=step_name,
            value=executable,
            Labels=[parameters_group, versions_group]))
204
205
    def resolve_resource(self, val):
206
        """
207
        Resolve a resource name to an absolute file path with the algorithm in
208
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters
209
210
        Args:
211
            val (string): resource value to resolve
212
        """
213
        executable = self.ocrd_tool['executable']
214
        log = getLogger('ocrd.%s.resolve_resource' % executable)
215
        if exists(val):
216
            log.debug("Resolved to absolute path %s" % val)
217
            return val
218
        ret = [cand for cand in list_resource_candidates(executable, val, cwd=self.old_pwd) if exists(cand)]
219
        if ret:
220
            log.debug("Resolved %s to absolute path %s" % (val, ret[0]))
221
            return ret[0]
222
        elif (val.startswith('http://') or val.startswith('https://')):
223
            resmgr = OcrdResourceManager()
224
            reslist = resmgr.find_resources(executable, url=val)
225
            if reslist:
226
                _, resdict = reslist[0]
227
                log.info("Found registered resource for %s: '%s' (%s)." % (executable, val, resdict))
228
            else:
229
                resdict = {}
230
                log.info("Not a registered resource for %s: '%s'." % (executable, val))
231
            return str(resmgr.download(
232
                executable,
233
                val,
234
                basedir = resmgr.location_to_resource_dir('data'),
235
                name=resdict.get('name', nth_url_segment(val)),
236
                path_in_archive=resdict.get('path_in_archive', '.'),
237
                resource_type=resdict.get('type', 'file')
238
                ))
239
        else:
240
            log.error("Could not find resource '%s' for executable '%s'. Try 'ocrd resmgr download %s %s' to download this resource or use a URL for the parameter value.",
241
                val, executable, executable, val)
242
            sys.exit(1)
243
244
    def list_all_resources(self):
        """
        List all resources found in the filesystem for this processor's
        executable.
        """
        executable = self.ocrd_tool['executable']
        # resolves to the module-level function, not to this method
        return list_all_resources(executable)
249
250
    @property
251
    def input_files(self):
252
        """
253
        List the input files (for single-valued :py:attr:`input_file_grp`).
254
255
        For each physical page:
256
257
        - If there is a single PAGE-XML for the page, take it (and forget about all
258
          other files for that page)
259
        - Else if there is a single image file, take it (and forget about all other
260
          files for that page)
261
        - Otherwise raise an error (complaining that only PAGE-XML warrants
262
          having multiple images for a single page)
263
        Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>_
264
        
265
        Returns:
266
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
267
        """
268
        if not self.input_file_grp:
269
            raise ValueError("Processor is missing input fileGrp")
270
        ret = self.zip_input_files(mimetype=None, on_error='abort')
271
        if not ret:
272
            return []
273
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
274
        return [tuples[0] for tuples in ret]
275
276
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
277
        """
278
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).
279
280
        Processors that expect/need multiple input file groups,
281
        cannot use :py:data:`input_files`. They must align (zip) input files
282
        across pages. This includes the case where not all pages
283
        are equally present in all file groups. It also requires
284
        making a consistent selection if there are multiple files
285
        per page.
286
287
        Following the OCR-D functional model, this function tries to
288
        find a single PAGE file per page, or fall back to a single
289
        image file per page. In either case, multiple matches per page
290
        are an error (see error handling below).
291
        This default behaviour can be changed by using a fixed MIME
292
        type filter via :py:attr:`mimetype`. But still, multiple matching
293
        files per page are an error.
294
295
        Single-page multiple-file errors are handled according to
296
        :py:attr:`on_error`:
297
298
        - if ``skip``, then the page for the respective fileGrp will be
299
          silently skipped (as if there was no match at all)
300
        - if ``first``, then the first matching file for the page will be
301
          silently selected (as if the first was the only match)
302
        - if ``last``, then the last matching file for the page will be
303
          silently selected (as if the last was the only match)
304
        - if ``abort``, then an exception will be raised.
305
        Multiple matches for PAGE-XML will always raise an exception.
306
307
        Keyword Args:
308
             require_first (boolean): If true, then skip a page entirely
309
                 whenever it is not available in the first input `fileGrp`.
310
             mimetype (string): If not `None`, filter by the specified MIME
311
                 type (literal or regex prefixed by `//`). Otherwise prefer
312
                 PAGE or image.
313
        Returns:
314
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
315
        """
316
        if not self.input_file_grp:
317
            raise ValueError("Processor is missing input fileGrp")
318
319
        LOG = getLogger('ocrd.processor.base')
320
        ifgs = self.input_file_grp.split(",")
321
        # Iterating over all files repeatedly may seem inefficient at first sight,
322
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
323
        # can actually be much more costly than traversing the ltree.
324
        # This might depend on the number of pages vs number of fileGrps.
325
326
        pages = dict()
327
        for i, ifg in enumerate(ifgs):
328
            for file_ in sorted(self.workspace.mets.find_all_files(
329
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
330
                                # sort by MIME type so PAGE comes before images
331
                                key=lambda file_: file_.mimetype):
332
                if not file_.pageId:
333
                    continue
334
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
335
                if ift[i]:
336
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
337
                    # fileGrp has multiple files for this page ID
338
                    if mimetype:
339
                        # filter was active, this must not happen
340 View Code Duplication
                        if on_error == 'skip':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
341
                            ift[i] = None
342
                        elif on_error == 'first':
343
                            pass # keep first match
344
                        elif on_error == 'last':
345
                            ift[i] = file_
346
                        elif on_error == 'abort':
347
                            raise ValueError(
348
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
349
                                    mimetype, file_.pageId, ifg))
350
                        else:
351
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
352
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
353
                          file_.mimetype != MIMETYPE_PAGE):
354
                        pass # keep PAGE match
355
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
356
                          file_.mimetype == MIMETYPE_PAGE):
357
                        raise ValueError(
358
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
359
                                file_.pageId, ifg))
360
                    else:
361
                        # filter was inactive but no PAGE is in control, this must not happen
362 View Code Duplication
                        if on_error == 'skip':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
363
                            ift[i] = None
364
                        elif on_error == 'first':
365
                            pass # keep first match
366
                        elif on_error == 'last':
367
                            ift[i] = file_
368
                        elif on_error == 'abort':
369
                            raise ValueError(
370
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
371
                                    file_.pageId, ifg))
372
                        else:
373
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
374
                else:
375
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
376
                    ift[i] = file_
377
        ifts = list()
378
        for page, ifiles in pages.items():
379
            for i, ifg in enumerate(ifgs):
380
                if not ifiles[i]:
381
                    # other fallback options?
382
                    LOG.error('found no page %s in file group %s',
383
                              page, ifg)
384
            if ifiles[0] or not require_first:
385
                ifts.append(tuple(ifiles))
386
        return ifts
387