Passed
Pull Request — master (#1240)
by
unknown
07:44
created

ocrd.processor.base.Processor.show_resource()   A

Complexity

Conditions 4

Size

Total Lines 12
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 12
rs 9.85
c 0
b 0
f 0
cc 4
nop 2
1
"""
2
Processor base class and helper functions.
3
"""
4
5
# Public API of this module; the helper functions are re-exported here
# for backwards-compatibility.
__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor',
]
11
12
from os.path import exists
13
from shutil import copyfileobj
14
import json
15
import os
16
from os import getcwd
17
from pathlib import Path
18
from typing import Optional
19
import sys
20
import tarfile
21
import io
22
from deprecated import deprecated
23
24
from ocrd.workspace import Workspace
25
from ocrd_utils import (
26
    VERSION as OCRD_VERSION,
27
    MIMETYPE_PAGE,
28
    MIME_TO_EXT,
29
    getLogger,
30
    initLogging,
31
    list_resource_candidates,
32
    pushd_popd,
33
    list_all_resources,
34
    get_processor_resource_types,
35
    resource_filename,
36
    make_file_id,
37
)
38
from ocrd_validators import ParameterValidator
39
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml
40
from ocrd_modelfactory import page_from_file
41
42
# XXX imports must remain for backwards-compatibility
43
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
44
45
class ResourceNotFoundError(FileNotFoundError):
    """
    Raised when a requested processor resource cannot be resolved.

    The instance keeps the resource ``name``, the ``executable`` it was
    requested for, and a ``message`` that includes the ``ocrd resmgr``
    command line suggested for downloading the resource.
    """
    def __init__(self, name, executable):
        self.name = name
        self.executable = executable
        template = "Could not find resource '%s' for executable '%s'. " \
                   "Try 'ocrd resmgr download %s %s' to download this resource."
        # the download hint takes the executable first, then the resource name
        self.message = template % (name, executable, executable, name)
        super().__init__(self.message)
57
58
class Processor():
59
    """
60
    A processor is a tool that implements the uniform OCR-D command-line interface
61
    for run-time data processing. That is, it executes a single workflow step,
62
    or a combination of workflow steps, on the workspace (represented by local METS).
63
    It reads input files for all or requested physical pages of the input fileGrp(s),
64
    and writes output files for them into the output fileGrp(s). It may take 
65
    a number of optional or mandatory parameters.
66
    """
67
68
    def __init__(
            self,
            # FIXME: deprecate in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=True,
            # FIXME: deprecate all the following in favor of respective methods
            resolve_resource=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            subcommand=None,
            show_version=False,
            dump_json=False,
            dump_module_dir=False,
            version=None
    ):
        """
        Instantiate, but do not process.

        If one of ``dump_json``, ``dump_module_dir``, ``list_resources``,
        ``resolve_resource``, ``show_resource``, ``show_help`` or
        ``show_version`` is set, only the respective information is printed
        and the constructor returns early (in that order of precedence).
        Otherwise the instance is set up for processing: the parameters are
        validated, and if a workspace was passed, the current working
        directory is changed to the workspace directory.

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
                 Can be ``None`` even for processing (esp. on multiple workspaces), \
                 but then needs to be set before running.
        Keyword Args:
             ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
                 Can be ``None`` for processing, but needs to be set before running.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
                 (or empty for all pages).
             download_files (boolean): Whether input files will be downloaded prior to processing.
             resolve_resource (string): If not ``None``, then instead of processing, resolve \
                 given resource by name and print its full path to stdout.
             show_resource (string): If not ``None``, then instead of processing, resolve \
                 given resource by name and print its contents to stdout.
             list_resources (boolean): If true, then instead of processing, find all installed \
                 resource files in the search paths and print their path names.
             show_help (boolean): If true, then instead of processing, print a usage description \
                 including the standard CLI and all of this processor's ocrd-tool parameters and \
                 docstrings.
             subcommand (string): 'worker' or 'server', only used here for the right --help output
             show_version (boolean): If true, then instead of processing, print information on \
                 this processor's version and OCR-D version.
             dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
                 on stdout.
             dump_module_dir (boolean): If true, then instead of processing, print :py:attr:`moduledir` \
                 on stdout.
             version (string): The processor's version, stored as :py:attr:`version` \
                 and printed by ``show_version``.

        Raises:
             Exception: if ``parameter`` does not validate against ``ocrd_tool``.
        """
        self.ocrd_tool = ocrd_tool
        parameter = {} if parameter is None else parameter

        # -- informational modes: print something, then return without further setup
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if dump_module_dir:
            print(self.moduledir)
            return
        if list_resources:
            for resource in self.list_all_resources():
                print(resource)
            return
        if resolve_resource:
            try:
                print(self.resolve_resource(resolve_resource))
            except ResourceNotFoundError as err:
                getLogger('ocrd.processor.base').critical(err.message)
                sys.exit(1)
            return
        if show_resource:
            try:
                self.show_resource(show_resource)
            except ResourceNotFoundError as err:
                getLogger('ocrd.processor.base').critical(err.message)
                sys.exit(1)
            return
        if show_help:
            self.show_help(subcommand=subcommand)
            return
        # the version must be known before show_version() can print it
        self.version = version
        if show_version:
            self.show_version()
            return

        # -- setup for actual processing
        self.workspace = workspace
        if self.workspace:
            # FIXME deprecate setting this and calling process() over using process_workspace()
            # which uses pushd_popd(self.workspace.directory)
            # (because there is no way to do that in process() since it's an
            # overridden method. chdir is almost always an anti-pattern.)
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        # an empty list of page IDs means the same as no restriction at all
        if page_id is None or page_id == []:
            self.page_id = None
        else:
            self.page_id = page_id
        self.download = download_files
        report = ParameterValidator(ocrd_tool).validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter
        # workaround for deprecated#72 (deprecation does not work for subclasses):
        # wrap the bound (possibly overridden) method on the instance
        mark_deprecated = deprecated(
            version='3.0',
            reason='process() should be replaced with process_page() and process_workspace()')
        self.process = mark_deprecated(self.process)
183
184
    def show_help(self, subcommand=None):
        """
        Print the usage description generated from :py:attr:`ocrd_tool`.

        Keyword Args:
             subcommand (string): passed through to ``generate_processor_help``
        """
        help_text = generate_processor_help(self.ocrd_tool,
                                            processor_instance=self,
                                            subcommand=subcommand)
        print(help_text)
186
187
    def show_version(self):
        """
        Print this processor's :py:attr:`version` along with the ocrd/core version.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
189
190
    def verify(self):
        """
        Check whether the :py:attr:`input_file_grp` meets the processor's requirements.

        This base implementation performs no checks and always returns ``True``.
        """
        return True
195
196
    def setup(self) -> None:
197
        """
198
        Prepare the processor for actual data processing,
199
        prior to changing to the workspace directory but
200
        after parsing parameters.
201
202
        (Override this to load models into memory etc.)
203
        """
204
        pass
205
206
    @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')
    def process(self) -> None:
        """
        Process all files of the :py:attr:`workspace`
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id` (or all pages)
        under the given :py:attr:`parameter`.

        The base implementation only raises :py:class:`NotImplementedError`.
        (This contains the main functionality and needs to be overridden by subclasses.)
        """
        raise NotImplementedError
218
219
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id` (or all pages)
        under the given :py:attr:`parameter`.

        Runs inside the workspace directory and stores ``workspace`` as
        :py:attr:`workspace`. For every tuple from :py:meth:`zip_input_files`,
        the files are downloaded (if :py:attr:`download` is set) and handed
        to :py:meth:`process_page_file`. If that raises
        :py:class:`NotImplementedError`, the deprecated :py:meth:`process`
        is called instead.
        """
        logger = getLogger('ocrd.processor.base')
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            try:
                # FIXME: add page parallelization by running multiprocessing.Pool (#322)
                for page_files in self.zip_input_files(on_error='abort'):
                    # FIXME: add error handling by catching exceptions in various ways (#579)
                    # for example:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    prepared = [None] * len(page_files)
                    for pos, page_file in enumerate(page_files):
                        if pos == 0:
                            logger.info("processing page %s", page_file.pageId)
                        elif page_file is None:
                            # file/page not found in this file grp
                            continue
                        prepared[pos] = page_file
                        if self.download:
                            try:
                                prepared[pos] = self.workspace.download_file(page_file)
                            except ValueError as err:
                                # the slot keeps the file as it was before the download attempt
                                logger.error(repr(err))
                                logger.warning("skipping file %s for page %s", page_file, page_file.pageId)
                    self.process_page_file(*prepared)
            except NotImplementedError:
                # fall back to deprecated method
                self.process()
265
266
    def process_page_file(self, *input_files) -> None:
        """
        Process the given ``input_files`` of the :py:attr:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`~ocrd_models.OcrdFile` per input fileGrp)
        under the given :py:attr:`parameter`, and make sure the
        results get added accordingly.

        Each input file is parsed as PAGE; a slot whose file is ``None``
        (page missing in that fileGrp) or cannot be parsed (``ValueError``)
        is passed on to :py:meth:`process_page_pcgts` as ``None``.
        The result gets its ``pcGtsId`` set, receives a metadata entry via
        :py:meth:`add_metadata`, and is added to the workspace as PAGE-XML
        in :py:attr:`output_file_grp`. The output file ID and page ID are
        derived from the first input file, which therefore must not be ``None``.

        (This uses process_page_pcgts, but can be overridden by subclasses
        to handle cases like multiple fileGrps, non-PAGE input etc.)
        """
        log = getLogger('ocrd.processor.base')
        input_pcgts = [None] * len(input_files)
        for i, input_file in enumerate(input_files):
            if input_file is None:
                # process_workspace() leaves None for fileGrps lacking this page;
                # keep the slot empty instead of failing on attribute access
                continue
            # FIXME: what about non-PAGE input like image or JSON ???
            log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId)
            try:
                input_pcgts[i] = page_from_file(input_file)
            except ValueError as e:
                log.info("non-PAGE input for page %s: %s", input_file.pageId, e)
        output_pcgts = self.process_page_pcgts(*input_pcgts)
        output_file_id = make_file_id(input_files[0], self.output_file_grp)
        output_pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(output_pcgts)
        # FIXME: what about save_image_file in process_page ???
        # FIXME: what about non-PAGE output like JSON ???
        self.workspace.add_file(file_id=output_file_id,
                                file_grp=self.output_file_grp,
                                page_id=input_files[0].pageId,
                                local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
                                mimetype=MIMETYPE_PAGE,
                                content=to_xml(output_pcgts))
298
299
    def process_page_pcgts(self, *input_pcgts) -> OcrdPage:
        """
        Process the given ``input_pcgts`` of the :py:attr:`workspace`,
        representing one physical page (passed as one parsed
        :py:class:`~ocrd_models.OcrdPage` per input fileGrp)
        under the given :py:attr:`parameter`, and return the
        resulting :py:class:`~ocrd_models.OcrdPage`.

        The base implementation only raises :py:class:`NotImplementedError`.
        (This contains the main functionality and must be overridden by subclasses.)
        """
        raise NotImplementedError
310
311
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
        the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.

        The item is named after the first entry of the ocrd-tool ``steps``,
        valued with the ``executable``, and carries two label groups: one with
        all :py:attr:`parameter` entries, one with the processor and ocrd/core versions.
        """
        metadata = pcgts.get_Metadata()
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        # one label per runtime parameter
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=name, value=value)
                   for name, value in self.parameter.items()])
        # versions of the processor itself and of ocrd/core
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=executable, value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        metadata.add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             name=step_name,
                             value=executable,
                             Labels=[parameter_labels, version_labels]))
334
335
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        If ``val`` itself exists as a path, it is returned unchanged.
        Otherwise the first existing candidate from ``list_resource_candidates``
        is returned, searching relative to the directory the processor was
        instantiated from (if it changed into a workspace) or else the
        current working directory, and relative to :py:attr:`moduledir`.

        Args:
            val (string): resource value to resolve
        Raises:
            ResourceNotFoundError: if neither ``val`` nor any candidate exists
        """
        initLogging()
        executable = self.ocrd_tool['executable']
        log = getLogger('ocrd.processor.base')
        if exists(val):
            log.debug("Resolved to absolute path %s" % val)
            return val
        # __init__ remembers the original directory before chdir'ing into the workspace
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=cwd, moduled=self.moduledir)
        for candidate in candidates:
            if exists(candidate):
                log.debug("Resolved %s to absolute path %s" % (val, candidate))
                return candidate
        raise ResourceNotFoundError(val, executable)
360
361
    def show_resource(self, val):
        """
        Resolve the resource ``val`` and write its contents to stdout (binary).

        A regular file is written as is. A directory is packed into a
        gzip-compressed tarball (with paths relative to the directory)
        which is then written to stdout.

        Args:
            val (string): resource value to resolve, see :py:meth:`resolve_resource`
        """
        resource_path = Path(self.resolve_resource(val))
        if not resource_path.is_dir():
            sys.stdout.buffer.write(resource_path.read_bytes())
            return
        # change into the directory so that archive members are relative to it
        with pushd_popd(resource_path):
            archive = io.BytesIO()
            with tarfile.open(fileobj=archive, mode='w:gz') as tarball:
                tarball.add('.')
            # rewind before copying the finished archive
            archive.seek(0)
            copyfileobj(archive, sys.stdout.buffer)
373
374
    def list_all_resources(self):
        """
        Yield all resources found in the filesystem for this processor,
        as :py:class:`pathlib.Path`, filtered by content-type via filename suffix.

        Unless the wildcard type ``*/*` `` is among the processor's resource types,
        directories are only kept for ``text/directory``, and files are only kept
        if their suffix matches at least one of the resource types.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        accept_anything = '*/*' in mimetypes
        for found in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            path = Path(found)
            if not accept_anything:
                if path.is_dir() and 'text/directory' not in mimetypes:
                    continue
                # if we do not know all MIME types, then keep the file, otherwise require suffix match
                # (an unknown MIME type falls back to the file's own suffix, i.e. matches)
                if path.is_file() and not any(path.suffix == MIME_TO_EXT.get(mime, path.suffix)
                                              for mime in mimetypes):
                    continue
            yield path
389
390
    @property
391
    def module(self):
392
        """
393
        The top-level module this processor belongs to.
394
        """
395
        # find shortest prefix path that is not just a namespace package
396
        fqname = ''
397
        for name in self.__module__.split('.'):
398
            if fqname:
399
                fqname += '.'
400
            fqname += name
401
            if getattr(sys.modules[fqname], '__file__', None):
402
                return fqname
403
        # fall-back
404
        return self.__module__
405
406
    @property
    def moduledir(self):
        """
        The filesystem path of the module directory,
        looked up via ``resource_filename`` for :py:attr:`module`.
        """
        top_level_module = self.module
        return resource_filename(top_level_module, '.')
412
413
    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        For each physical page:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)

        See `Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

        The selection itself is delegated to :py:meth:`zip_input_files`
        (without MIME type filter, aborting on ambiguity).

        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
        Raises:
            ValueError: if :py:attr:`input_file_grp` is not set
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        file_tuples = self.zip_input_files(mimetype=None, on_error='abort')
        if not file_tuples:
            return []
        # every tuple has one slot per input fileGrp, so checking the first suffices
        assert len(file_tuples[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [file_tuple[0] for file_tuple in file_tuples]
438
439
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Multiple matches for PAGE-XML will always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
             on_error (string): How to handle multiple matching files for a
                 page in one `fileGrp`: `skip`, `first`, `last` or `abort`
                 (see above). With `abort`, an exception is also raised if
                 :py:attr:`page_id` is set but no files were found for a `fileGrp`.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        Raises:
            ValueError: if :py:attr:`input_file_grp` is not set, or in the
                error cases described above
            Exception: if ``on_error`` is not one of the known strategies
                (only checked when a conflict actually occurs)
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        # maps page ID to a list with one slot (file or None) per input fileGrp
        pages = {}
        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype)
            # Warn if no files found but pageId was specified because that
            # might be because of invalid page_id (range)
            if self.page_id and not files_:
                msg = (f"Could not find any files for --page-id {self.page_id} - "
                       f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
                if on_error == 'abort':
                    raise ValueError(msg)
                LOG.warning(msg)
            for file_ in files_:
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if not ift[i]:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                if mimetype:
                    # filter was active, this must not happen
                    ift[i] = self._resolve_multiple_matches(
                        ift[i], file_, on_error,
                        "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                            mimetype, file_.pageId, ifg))
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    # otherwise keep PAGE match
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    # NOTE(review): after 'skip' empties the slot, a further match
                    # for the same page would be added again as if it were the
                    # first one - confirm whether that is intended
                    ift[i] = self._resolve_multiple_matches(
                        ift[i], file_, on_error,
                        "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                            file_.pageId, ifg))
        ifts = []
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts

    @staticmethod
    def _resolve_multiple_matches(current, candidate, on_error, abort_message):
        """
        Decide which file occupies a page/fileGrp slot that matched twice.

        Args:
            current: the file already in the slot
            candidate: the additional matching file
            on_error (string): ``skip`` (empty the slot), ``first`` (keep
                ``current``), ``last`` (take ``candidate``) or ``abort``
            abort_message (string): message of the exception raised for ``abort``
        Returns:
            The new slot content (a file or ``None``).
        Raises:
            ValueError: if ``on_error`` is ``abort``
            Exception: if ``on_error`` is not a known strategy
        """
        if on_error == 'skip':
            return None
        if on_error == 'first':
            return current
        if on_error == 'last':
            return candidate
        if on_error == 'abort':
            raise ValueError(abort_message)
        raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
559