Passed
Pull Request — master (#1256)
by
unknown
03:01
created

ocrd.processor.base.Processor.show_resource()   A

Complexity

Conditions 4

Size

Total Lines 12
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 12
rs 9.85
c 0
b 0
f 0
cc 4
nop 2
1
"""
2
Processor base class and helper functions.
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from os.path import exists
13
from shutil import copyfileobj
14
import json
15
import os
16
from os import getcwd
17
from pathlib import Path
18
import sys
19
import tarfile
20
import io
21
from ocrd.workspace import Workspace
22
23
from ocrd_utils import (
24
    VERSION as OCRD_VERSION,
25
    MIMETYPE_PAGE,
26
    MIME_TO_EXT,
27
    getLogger,
28
    initLogging,
29
    list_resource_candidates,
30
    pushd_popd,
31
    list_all_resources,
32
    get_processor_resource_types,
33
    resource_filename,
34
)
35
from ocrd_validators import ParameterValidator
36
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
37
38
# XXX imports must remain for backwards-compatibility
39
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
40
41
class ResourceNotFoundError(FileNotFoundError):
    """
    Raised when a processor resource cannot be resolved by name.

    Besides the usual exception arguments, the instance carries the
    requested resource ``name``, the ``executable`` it was requested for,
    and the formatted ``message`` (which includes a download hint).
    """
    def __init__(self, name, executable):
        template = ("Could not find resource '%s' for executable '%s'. "
                    "Try 'ocrd resmgr download %s %s' to download this resource.")
        self.name = name
        self.executable = executable
        # the download hint takes the executable first, then the resource name
        self.message = template % (name, executable, executable, name)
        super().__init__(self.message)
53
54
class Processor():
55
    """
56
    A processor is a tool that implements the uniform OCR-D command-line interface
57
    for run-time data processing. That is, it executes a single workflow step,
58
    or a combination of workflow steps, on the workspace (represented by local METS).
59
    It reads input files for all or requested physical pages of the input fileGrp(s),
60
    and writes output files for them into the output fileGrp(s). It may take 
61
    a number of optional or mandatory parameters.
62
    """
63
64
    def __init__(
            self,
            workspace : Workspace,
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            resolve_resource=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            subcommand=None,
            show_version=False,
            dump_json=False,
            dump_module_dir=False,
            version=None
    ):
        """
        Create the processor without running it.

        The "informational" switches are checked in a fixed order
        (``dump_json``, ``dump_module_dir``, ``list_resources``,
        ``resolve_resource``, ``show_resource``, ``show_help``,
        ``show_version``). The first one that is set has its output printed
        and the constructor returns right away, leaving the remaining
        attributes unset. Only if none is set, the instance is prepared for
        processing: the workspace directory is entered and the parameters
        are validated.

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process.
                 Can be ``None`` even for processing (esp. on multiple workspaces),
                 but then needs to be set before running.
        Keyword Args:
             ocrd_tool (string): JSON of the ocrd-tool description for that processor.
                 Can be ``None`` for processing, but needs to be set before running.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``.
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process
                 (or empty for all pages).
             resolve_resource (string): If set, then instead of processing, resolve
                 the given resource by name and print its full path to stdout.
             show_resource (string): If set, then instead of processing, resolve
                 the given resource by name and print its contents to stdout.
             list_resources (boolean): If true, then instead of processing, find all installed
                 resource files in the search paths and print their path names.
             show_help (boolean): If true, then instead of processing, print a usage description
                 including the standard CLI and all of this processor's ocrd-tool parameters and
                 docstrings.
             subcommand (string): 'worker' or 'server', only used here for the right --help output
             show_version (boolean): If true, then instead of processing, print information on
                 this processor's version and OCR-D version.
             dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool`
                 on stdout.
             dump_module_dir (boolean): If true, then instead of processing, print :py:attr:`moduledir`
                 on stdout.
             version (string): version of this processor, as printed by ``show_version``.

        Raises:
            Exception: if ``parameter`` does not validate against ``ocrd_tool``.
        """
        self.ocrd_tool = ocrd_tool

        # --- informational modes: print something, then stop constructing ---
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if dump_module_dir:
            print(self.moduledir)
            return
        if list_resources:
            for resource in self.list_all_resources():
                print(resource)
            return
        if resolve_resource or show_resource:
            # resolving takes precedence over showing when both are requested
            try:
                if resolve_resource:
                    print(self.resolve_resource(resolve_resource))
                else:
                    self.show_resource(show_resource)
            except ResourceNotFoundError as err:
                getLogger('ocrd.processor.base').critical(err.message)
                sys.exit(1)
            return
        if show_help:
            self.show_help(subcommand=subcommand)
            return
        self.version = version
        if show_version:
            self.show_version()
            return

        # --- setup for actual processing ---
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            # remember where we came from, so resources can still be
            # resolved relative to the original working directory
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        # an empty list means the same as "not given"
        if page_id == [] or page_id is None:
            page_id = None
        self.page_id = page_id
        if parameter is None:
            parameter = {}
        validation = ParameterValidator(ocrd_tool).validate(parameter)
        if not validation.is_valid:
            raise Exception("Invalid parameters %s" % validation.errors)
        self.parameter = parameter
170
171
    def show_help(self, subcommand=None):
        """
        Print the usage description generated for :py:attr:`ocrd_tool` to stdout.

        Keyword Args:
             subcommand (string): handed through to the help generator unchanged.
        """
        help_text = generate_processor_help(
            self.ocrd_tool,
            processor_instance=self,
            subcommand=subcommand)
        print(help_text)
173
174
    def show_version(self):
        """
        Print this processor's :py:attr:`version` and the OCR-D core version to stdout.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
176
177
    def verify(self):
        """
        Check whether :py:attr:`input_file_grp` meets this processor's requirements.

        The base implementation performs no checks at all.

        Returns:
            ``True``, unconditionally.
        """
        return True
182
183
    def process(self) -> None:
        """
        Run the processor on the :py:attr:`workspace`,
        reading from :py:attr:`input_file_grp`,
        writing to :py:attr:`output_file_grp`,
        restricted to :py:attr:`page_id`
        and configured by :py:attr:`parameter`.

        (This is where the actual work happens; subclasses must override it.)

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError
194
195
196
    def add_metadata(self, pcgts):
        """
        Append a :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem``
        of type ``processingStep`` to the metadata of the
        :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.

        The item is named after the first entry of the ocrd-tool ``steps``, its value
        is the ocrd-tool ``executable``, and it carries two label groups: one with the
        runtime parameters, one with the processor and OCR-D core versions.
        """
        # resolve the target first, as the nested original expression did
        add_item = pcgts.get_Metadata().add_MetadataItem
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=self.parameter[key])
                   for key in self.parameter.keys()])
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=executable, value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        add_item(MetadataItemType(
            type_="processingStep",
            name=step_name,
            value=executable,
            Labels=[parameter_labels, version_labels]))
219
220
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        If ``val`` already names an existing path, it is returned unchanged.
        Otherwise the resource candidates for this executable are searched,
        relative to the directory that was current before the workspace was
        entered (if any, else the current directory), and the first existing
        candidate wins.

        Args:
            val (string): resource value to resolve
        Returns:
            The resolved path.
        Raises:
            ResourceNotFoundError: if no existing candidate was found.
        """
        initLogging()
        executable = self.ocrd_tool['executable']
        log = getLogger('ocrd.processor.base')
        if exists(val):
            log.debug("Resolved to absolute path %s" % val)
            return val
        # old_pwd only exists if the constructor changed into a workspace
        search_cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=search_cwd, moduled=self.moduledir)
        found = list(filter(exists, candidates))
        if not found:
            raise ResourceNotFoundError(val, executable)
        log.debug("Resolved %s to absolute path %s" % (val, found[0]))
        return found[0]
245
246
    def show_resource(self, val):
        """
        Resolve the resource ``val`` and write its contents to binary stdout.

        A regular file is written verbatim. A directory is packed into a
        gzip-compressed tarball (with member paths relative to the directory
        itself), which is built in memory and then written out.

        Args:
            val (string): resource value to resolve
        """
        resource_path = Path(self.resolve_resource(val))
        if not resource_path.is_dir():
            sys.stdout.buffer.write(resource_path.read_bytes())
            return
        # change into the directory so the archive members start at '.'
        with pushd_popd(resource_path):
            buffer = io.BytesIO()
            with tarfile.open(fileobj=buffer, mode='w:gz') as archive:
                archive.add('.')
            buffer.seek(0)
            copyfileobj(buffer, sys.stdout.buffer)
258
259
    def list_all_resources(self):
        """
        Generate all resources found in the filesystem whose content-type
        (judged by filename suffix) matches what the processor accepts.

        Yields:
            :py:class:`pathlib.Path` objects, one per accepted resource.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        for found in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            path = Path(found)
            if '*/*' in mimetypes:
                # wildcard: everything is acceptable
                yield path
                continue
            if path.is_dir() and 'text/directory' not in mimetypes:
                continue
            # A MIME type without known extension maps to the file's own suffix,
            # i.e. it matches any file; otherwise the suffix has to agree.
            if path.is_file() and all(path.suffix != MIME_TO_EXT.get(mime, path.suffix)
                                      for mime in mimetypes):
                continue
            yield path
274
275
    @property
    def module(self):
        """
        The top-level module this processor belongs to.

        This is the shortest dotted prefix of ``__module__`` whose entry in
        ``sys.modules`` has a non-empty ``__file__`` (which skips prefixes
        that are mere namespace packages). If there is no such prefix,
        ``__module__`` itself is returned.
        """
        parts = self.__module__.split('.')
        for depth, _ in enumerate(parts, start=1):
            prefix = '.'.join(parts[:depth])
            if getattr(sys.modules[prefix], '__file__', None):
                return prefix
        # fall-back
        return self.__module__
290
291
    @property
    def moduledir(self):
        """
        The filesystem path of the directory of :py:attr:`module`.
        """
        toplevel = self.module
        return resource_filename(toplevel, '.')
297
298
    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        The selection is delegated to :py:meth:`zip_input_files` (without MIME
        type filter and with ``on_error='abort'``), so for each physical page:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)

        `Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
        Raises:
            ValueError: if :py:attr:`input_file_grp` is not set.
            AssertionError: if more than one input fileGrp is configured.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        page_tuples = self.zip_input_files(mimetype=None, on_error='abort')
        if not page_tuples:
            return []
        # every tuple has one slot per input fileGrp - there must be just one
        assert len(page_tuples[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [page_tuple[0] for page_tuple in page_tuples]
323
324
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Multiple matches for PAGE-XML will always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
             on_error (string): How to treat multiple matching files for a
                 page in one `fileGrp`: ``skip``, ``first``, ``last`` or ``abort``.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples,
            each with one slot per input `fileGrp` (``None`` where there
            is no file).
        Raises:
            ValueError: if :py:attr:`input_file_grp` is not set, if there are
                multiple PAGE-XML files for a page, or if ``on_error`` is
                ``abort`` and either nothing was found for the requested
                :py:attr:`page_id` or a page has multiple matches.
            Exception: if a page has multiple matches and ``on_error`` is
                not one of the known strategies.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")

        def resolve_ambiguity(kept, candidate, abort_message):
            # Decide which file stays in a page/fileGrp slot that has more
            # than one match, according to the on_error strategy.
            if on_error == 'skip':
                return None
            if on_error == 'first':
                return kept
            if on_error == 'last':
                return candidate
            if on_error == 'abort':
                raise ValueError(abort_message)
            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        # page ID -> list with one slot per input fileGrp
        pages = dict()
        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype)
            # Warn if no files found but pageId was specified because that
            # might be because of invalid page_id (range)
            if self.page_id and not files_:
                msg = (f"Could not find any files for --page-id {self.page_id} - "
                       f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
                if on_error == 'abort':
                    raise ValueError(msg)
                LOG.warning(msg)
            for file_ in files_:
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if not ift[i]:
                    # NOTE(review): a slot emptied by 'skip' also ends up here, so a
                    # third match for the same page is accepted again as if it were
                    # the first - confirm whether that is intended.
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                if mimetype:
                    # filter was active, this must not happen
                    ift[i] = resolve_ambiguity(
                        ift[i], file_,
                        "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                            mimetype, file_.pageId, ifg))
                elif ift[i].mimetype != MIMETYPE_PAGE:
                    # filter was inactive but no PAGE is in control, this must not happen
                    ift[i] = resolve_ambiguity(
                        ift[i], file_,
                        "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                            file_.pageId, ifg))
                elif file_.mimetype == MIMETYPE_PAGE:
                    raise ValueError(
                        "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                            file_.pageId, ifg))
                # otherwise: keep the PAGE match over the non-PAGE file
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
444