Passed
Pull Request — master (#1240)
by
unknown
02:48
created

ocrd.processor.base.Processor.resolve_resource()   A

Complexity

Conditions 4

Size

Total Lines 24
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 15
dl 0
loc 24
rs 9.65
c 0
b 0
f 0
cc 4
nop 2
1
"""
2
Processor base class and helper functions.
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from functools import cached_property
13
from os.path import exists, join
14
from shutil import copyfileobj
15
import json
16
import os
17
from os import getcwd
18
from pathlib import Path
19
from typing import List, Optional, Union, get_args
20
import sys
21
import inspect
22
import tarfile
23
import io
24
import weakref
25
from frozendict import frozendict
26
27
from click import wrap_text
28
from deprecated import deprecated
29
from requests import HTTPError
30
31
from ocrd.workspace import Workspace
32
from ocrd_models.ocrd_file import OcrdFileType
33
from ocrd.processor.ocrd_page_result import OcrdPageResult
34
from ocrd_utils import (
35
    VERSION as OCRD_VERSION,
36
    MIMETYPE_PAGE,
37
    MIME_TO_EXT,
38
    config,
39
    getLogger,
40
    list_resource_candidates,
41
    pushd_popd,
42
    list_all_resources,
43
    get_processor_resource_types,
44
    resource_filename,
45
    parse_json_file_with_comments,
46
    make_file_id,
47
    deprecation_warning
48
)
49
from ocrd_validators import ParameterValidator
50
from ocrd_models.ocrd_page import (
51
    PageType,
52
    AlternativeImageType,
53
    MetadataItemType,
54
    LabelType,
55
    LabelsType,
56
    OcrdPage,
57
    to_xml,
58
)
59
from ocrd_modelfactory import page_from_file
60
from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
61
62
# XXX imports must remain for backwards-compatibility
63
from .helpers import run_cli, run_processor # pylint: disable=unused-import
64
65
66
class ResourceNotFoundError(FileNotFoundError):
    """
    An exception signifying the requested processor resource
    cannot be resolved.

    Attributes:
        name: the resource name that could not be resolved
        executable: the processor executable the resource was requested for
        message: the human-readable error text (also passed to the base class)
    """
    def __init__(self, name, executable):
        self.name = name
        self.executable = executable
        self.message = (f"Could not find resource '{name}' for executable '{executable}'. "
                        f"Try 'ocrd resmgr download {executable} {name}' to download this resource.")
        super().__init__(self.message)

    def __reduce__(self):
        # The base class only keeps ``(message,)`` in ``args``, so the default
        # pickle/copy protocol would re-invoke ``__init__`` with a single
        # argument and fail. Rebuild from the constructor arguments instead.
        return (self.__class__, (self.name, self.executable))
77
78
class NonUniqueInputFile(ValueError):
    """
    An exception signifying the specified fileGrp / pageId / mimetype
    selector yields multiple PAGE files, or no PAGE files but multiple images,
    or multiple files of that mimetype.

    Attributes:
        fileGrp: the file group of the selector
        pageId: the page ID of the selector
        mimetype: the mimetype of the selector (falsy values are rendered
            as ``PAGE+image(s)`` in the message)
        message: the human-readable error text (also passed to the base class)
    """
    def __init__(self, fileGrp, pageId, mimetype):
        self.fileGrp = fileGrp
        self.pageId = pageId
        self.mimetype = mimetype
        self.message = (f"Could not determine unique input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
        super().__init__(self.message)

    def __reduce__(self):
        # ``args`` only holds the formatted message, so default pickling/copying
        # would call ``__init__`` with one argument and raise TypeError.
        # Rebuild from the original selector instead.
        return (self.__class__, (self.fileGrp, self.pageId, self.mimetype))
91
92
class MissingInputFile(ValueError):
    """
    An exception signifying the specified fileGrp / pageId / mimetype
    selector yields no PAGE files, or no PAGE and no image files,
    or no files of that mimetype.

    Attributes:
        fileGrp: the file group of the selector
        pageId: the page ID of the selector
        mimetype: the mimetype of the selector (falsy values are rendered
            as ``PAGE+image(s)`` in the message)
        message: the human-readable error text (also passed to the base class)
    """
    def __init__(self, fileGrp, pageId, mimetype):
        self.fileGrp = fileGrp
        self.pageId = pageId
        self.mimetype = mimetype
        self.message = (f"Could not find input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
        super().__init__(self.message)

    def __reduce__(self):
        # ``args`` only holds the formatted message, so default pickling/copying
        # would call ``__init__`` with one argument and raise TypeError.
        # Rebuild from the original selector instead.
        return (self.__class__, (self.fileGrp, self.pageId, self.mimetype))
105
106
class Processor():
107
    """
108
    A processor is a tool that implements the uniform OCR-D
109
    `command-line interface for run-time data processing <https://ocr-d.de/en/spec/cli>`_.
110
111
    That is, it executes a single workflow step, or a combination of workflow steps,
112
    on the workspace (represented by local METS). It reads input files for all or selected
113
    physical pages of the input fileGrp(s), computes additional annotation, and writes output
114
    files for them into the output fileGrp(s). It may take a number of optional or mandatory
115
    parameters.
116
    """
117
118
    max_instances : int = -1
119
    """
120
    maximum number of cached instances (ignored if negative), to be applied on top of
121
    :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
122
123
    (Override this if you know how many instances fit into memory at once.)
124
    """
125
126
    @property
    def metadata_filename(self) -> str:
        """
        Name of the ``ocrd-tool.json`` file, relative to the package root.

        Consumed by :py:data:`metadata_location`.

        (Override when the file does not sit in the root of the module,
        e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``.)
        """
        return 'ocrd-tool.json'
137
138
    @cached_property
    def metadata_location(self) -> Path:
        """
        Location of the ``ocrd-tool.json`` file shipped with the package,
        looked up via ``resource_filename`` on the top-level package name
        and :py:data:`metadata_filename`.

        Consumed by :py:data:`metadata_rawdict`.

        (Override if ``ocrd-tool.json`` is not distributed with the Python package.)
        """
        # first dotted component of the defining module = top-level package
        top_level_package = self.__module__.split('.')[0]
        return resource_filename(top_level_package, self.metadata_filename)
148
149
    @cached_property
    def metadata_rawdict(self) -> dict:
        """
        The ``ocrd-tool.json`` contents as parsed from :py:data:`metadata_location`,
        without any validation or expansion applied here.

        Consumed by :py:data:`metadata`.

        (Override if ``ocrd-tool.json`` is not in a file.)
        """
        location = self.metadata_location
        return parse_json_file_with_comments(location)
159
160
    @cached_property
    def metadata(self) -> dict:
        """
        The ``ocrd-tool.json`` dict contents of the package, according to the OCR-D
        `spec <https://ocr-d.de/en/spec/ocrd_tool>`_ for processor tools.

        The raw dict is run through ``OcrdToolValidator.validate`` against the
        `schema <https://ocr-d.de/en/spec/ocrd_tool#definition>`_. A failed
        validation is only logged as an error; the dict is returned either way.

        Consumed by :py:data:`ocrd_tool` and :py:data:`version`.

        (Override if you want to provide metadata programmatically instead of a
        JSON file.)
        """
        rawdict = self.metadata_rawdict
        report = OcrdToolValidator.validate(rawdict)
        if report.is_valid:
            return rawdict
        # a report without hard errors is merely "problematic"
        severity = 'invalid' if report.errors else 'problematic'
        self.logger.error(f"The ocrd-tool.json of this processor is {severity}:\n"
                          f"{report.to_xml()}.\nPlease open an issue at {rawdict.get('git_url', 'the website')}.")
        return rawdict
181
182
    @cached_property
    def version(self) -> str:
        """
        The program version of the package, read from the ``version``
        entry of :py:data:`metadata`.

        (Override if you do not want to use :py:data:`metadata` lookup
        mechanism.)
        """
        package_metadata = self.metadata
        return package_metadata['version']
192
193
    @cached_property
    def executable(self) -> str:
        """
        The executable name of this processor tool, derived from the base name
        of the file of the outermost frame on the current call stack.

        Consumed by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`.

        (Override if your entry-point name deviates from the ``executable``
        name, or the processor gets instantiated from another runtime.)
        """
        # last entry of inspect.stack() is the outermost caller
        outermost_frame = inspect.stack()[-1]
        return os.path.basename(outermost_frame.filename)
205
206
    @cached_property
    def ocrd_tool(self) -> dict:
        """
        The ``ocrd-tool.json`` dict contents of this processor tool, i.e. the
        entry under :py:data:`executable` in the ``tools`` section of
        :py:data:`metadata`.

        (Override if you do not want to use :py:data:`metadata` lookup
        mechanism.)
        """
        tools = self.metadata['tools']
        return tools[self.executable]
217
218
    @property
    def parameter(self) -> Optional[dict]:
        """the runtime parameter dict to be used by this processor (``None`` until set)"""
        return getattr(self, '_parameter', None)
224
225
    @parameter.setter
    def parameter(self, parameter : dict) -> None:
        # parameters were set before, so bring the old state down first
        if self.parameter is not None:
            self.shutdown()
        report = ParameterValidator(self.ocrd_tool).validate(parameter)
        if not report.is_valid:
            raise ValueError(f'Invalid parameters:\n{report.to_xml()}')
        # store a read-only copy of the parameter dict
        self._parameter = frozendict(parameter)
        # (re-)run setup to load models etc
        self.setup()
237
238
    def __init__(
            self,
            # FIXME: remove in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=config.OCRD_DOWNLOAD_INPUT,
            version=None
    ):
        """
        Create the processor instance. Nothing is set up for processing,
        unless ``parameter`` is passed: then it gets validated and applied
        through the :py:data:`.parameter` setter (which also runs :py:meth:`setup`).

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process.
                 If not ``None``, then `chdir` to that directory.
                 Deprecated since version 3.0: Should be ``None`` here, but then
                 needs to be set before processing.
        Keyword Args:
             ocrd_tool (dict): Deprecated; emits a deprecation warning when passed.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``.
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input.
                 Deprecated since version 3.0: Should be ``None`` here, but then
                 needs to be set before processing.
             output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output.
                 Deprecated since version 3.0: Should be ``None`` here, but then
                 needs to be set before processing.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process
                 (or empty for all pages).
                 Deprecated since version 3.0: Should be ``None`` here, but then
                 needs to be set before processing.
             download_files (boolean): Whether input files will be downloaded prior to processing,
                 defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT`
             version (string): Deprecated; emits a deprecation warning when passed.
        """
        # --- deprecated ways of passing tool metadata ---
        if ocrd_tool is not None:
            deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                "use or override metadata/executable/ocrd-tool properties instead")
            self._ocrd_tool = ocrd_tool
            self._executable = ocrd_tool['executable']
        if version is not None:
            deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
                                "use or override metadata/version properties instead")
            self._version = version
        # --- deprecated ways of passing the processing context ---
        if workspace is not None:
            deprecation_warning("Passing a workspace argument other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.workspace = workspace
            # remember where we came from (read back by resolve_resource)
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        if input_file_grp is not None:
            deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.input_file_grp = input_file_grp
        if output_file_grp is not None:
            deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.output_file_grp = output_file_grp
        if page_id is not None:
            deprecation_warning("Passing a page_id kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            # normalise an empty selection to None
            self.page_id = page_id or None
        self.download = download_files
        #: The logger to be used by processor implementations.
        # `ocrd.processor.base` internals should use :py:attr:`self._base_logger`
        self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
        self._base_logger = getLogger('ocrd.processor.base')
        # loggers must exist before this point: the setter validates and runs setup()
        if parameter is not None:
            self.parameter = parameter
        # ensure that shutdown gets called at destruction
        # NOTE(review): the Python docs advise against passing a bound method of the
        # tracked object to weakref.finalize, since the callback then keeps the object
        # alive - confirm this is intended
        self._finalizer = weakref.finalize(self, self.shutdown)
        # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
        mark_deprecated = deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')
        self.process = mark_deprecated(self.process)
314
315
    def show_help(self, subcommand=None):
        """
        Print the usage text produced by ``generate_processor_help`` for this
        processor's :py:attr:`ocrd_tool` (optionally for a ``subcommand``).
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self, subcommand=subcommand)
        print(help_text)
321
322
    def show_version(self):
        """
        Print this processor's :py:attr:`version` together with the OCR-D core version.
        """
        print(f"Version {self.version}, ocrd/core {OCRD_VERSION}")
327
328
    def verify(self):
        """
        Check :py:attr:`input_file_grp` and :py:attr:`output_file_grp` against the
        cardinalities declared in :py:attr:`ocrd_tool`, and check that every input
        fileGrp exists in the workspace METS.

        Returns:
            ``True`` (kept for backwards compatibility).

        Raises:
            AssertionError: if any of the checks fails.
        """
        assert self.input_file_grp is not None
        assert self.output_file_grp is not None
        input_file_grps = self.input_file_grp.split(',')
        output_file_grps = self.output_file_grp.split(',')

        def check_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
            count = len(grps)
            if isinstance(spec, int):
                # non-positive numbers impose no constraint
                if spec > 0:
                    assert count == spec, msg % (count, str(spec))
                return
            assert isinstance(spec, list)
            # [minimum, maximum], each only enforced when positive
            minimum, maximum = spec[0], spec[1]
            if minimum > 0:
                assert count >= minimum, msg % (count, str(spec))
            if maximum > 0:
                assert count <= maximum, msg % (count, str(spec))

        check_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
                          "Unexpected number of input file groups %d vs %s")
        check_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
                          "Unexpected number of output file groups %d vs %s")
        for grp in input_file_grps:
            assert grp in self.workspace.mets.file_groups
        # keep this for backwards compatibility:
        return True
356
357
    def dump_json(self):
        """
        Serialize :py:attr:`ocrd_tool` as indented JSON and print it on stdout.
        """
        serialized = json.dumps(self.ocrd_tool, indent=True)
        print(serialized)
362
363
    def dump_module_dir(self):
        """
        Write the value of :py:attr:`moduledir` to stdout.
        """
        module_directory = self.moduledir
        print(module_directory)
368
369
    def list_resources(self):
        """
        Print every resource yielded by :py:meth:`list_all_resources`, one per line.
        """
        for resource_path in self.list_all_resources():
            print(resource_path)
375
376
    def setup(self) -> None:
        """
        Hook for preparing the processor for actual data processing.
        It gets called by the :py:data:`.parameter` setter once the
        parameters have been validated and stored.

        Does nothing by default.

        (Override this to load models into memory etc.)
        """
385
386
    def shutdown(self) -> None:
        """
        Hook for bringing down the processor after data processing,
        after changing back from the workspace directory but
        before exiting (or before setting up with different parameters).

        Does nothing by default.

        (Override this to unload models from memory etc.)
        """
395
396
    @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
    def process(self) -> None:
        """
        Deprecated entry point: process all files of the :py:data:`workspace`
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        The base implementation only raises :py:class:`NotImplementedError`;
        subclasses still relying on this API have to override it.
        """
        raise NotImplementedError()
409
410
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        Runs inside the workspace directory. For every page tuple from
        :py:meth:`.zip_input_files`, input files are downloaded (if
        :py:attr:`download` is set) and passed to :py:meth:`.process_page_file`.
        Failures are handled according to ``config.OCRD_EXISTING_OUTPUT``
        (``ABORT`` / ``SKIP`` / ``OVERWRITE``) and ``config.OCRD_MISSING_OUTPUT``
        (``ABORT`` / ``SKIP`` / ``COPY``).

        If page-wise processing is not implemented (``NotImplementedError``),
        this falls back to the deprecated :py:meth:`.process`.

        (It should be overridden by subclasses to handle cases
        like post-processing or computation across pages.)

        Raises:
            ValueError: if ``config.OCRD_MISSING_OUTPUT`` has an unknown value.
            Exception: if a page fails under ``ABORT``, or if the ratio of
                succeeded to skipped/copied pages drops below
                ``config.OCRD_MAX_MISSING_OUTPUTS``.
        """
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            self.verify()
            try:
                nr_succeeded = 0
                nr_skipped = 0
                nr_copied = 0
                # FIXME: add page parallelization by running multiprocessing.Pool (#322)
                for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
                    input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
                    # take the page ID from the first fileGrp that has a file for this page
                    page_id = next(input_file.pageId
                                   for input_file in input_file_tuple
                                   if input_file)
                    self._base_logger.info(f"processing page {page_id}")
                    for i, input_file in enumerate(input_file_tuple):
                        if input_file is None:
                            # file/page not found in this file grp
                            continue
                        input_files[i] = input_file
                        if not self.download:
                            continue
                        try:
                            input_files[i] = self.workspace.download_file(input_file)
                        except (ValueError, FileNotFoundError, HTTPError) as e:
                            # keep the undownloaded file entry and carry on
                            self._base_logger.error(repr(e))
                            self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
                    # FIXME: differentiate error cases in various ways:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    try:
                        self.process_page_file(*input_files)
                        nr_succeeded += 1
                    # exclude NotImplementedError, so we can try process() below
                    except NotImplementedError:
                        raise
                    # handle already existing output separately
                    except FileExistsError as err:
                        if config.OCRD_EXISTING_OUTPUT == 'ABORT':
                            raise
                        if config.OCRD_EXISTING_OUTPUT == 'SKIP':
                            continue
                        if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
                            # too late here, must not happen
                            raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") from err
                    # broad coverage of output failures
                    except Exception as err:
                        # FIXME: add re-usable/actionable logging
                        self._base_logger.exception(f"Failure on page {page_id}: {err}")
                        if config.OCRD_MISSING_OUTPUT == 'ABORT':
                            raise
                        if config.OCRD_MISSING_OUTPUT == 'SKIP':
                            nr_skipped += 1
                            continue
                        if config.OCRD_MISSING_OUTPUT == 'COPY':
                            self._copy_page_file(input_files[0])
                            nr_copied += 1
                        else:
                            desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
                            raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
                # NOTE(review): both checks compare succeeded/failed against
                # OCRD_MAX_MISSING_OUTPUTS - confirm this ratio matches the
                # documented meaning of that setting
                if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
                    raise Exception(f"too many failures with skipped output ({nr_skipped})")
                if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
                    # report the number of copied (fallback) pages, not the skipped ones
                    raise Exception(f"too many failures with fallback output ({nr_copied})")
            except NotImplementedError:
                # fall back to deprecated method
                self.process()
490
491
    def _copy_page_file(self, input_file : OcrdFileType) -> None:
        """
        Fallback for failed pages: parse the given ``input_file`` of the
        :py:data:`workspace` as PAGE, stamp it with a new ID and this
        processor's metadata, and add it to the output fileGrp
        as if it was a processing result.

        If the file cannot be converted to PAGE (``ValueError``), this only
        logs an error and returns without adding anything.
        """
        page_copy : OcrdPage
        assert isinstance(input_file, get_args(OcrdFileType))
        source_page_id = input_file.pageId
        self._base_logger.debug(f"parsing file {input_file.ID} for page {source_page_id}")
        try:
            page_copy = page_from_file(input_file)
        except ValueError as err:
            # not PAGE and not an image to generate PAGE for
            self._base_logger.error(f"non-PAGE input for page {input_file.pageId}: {err}")
            return
        new_file_id = make_file_id(input_file, self.output_file_grp)
        page_copy.set_pcGtsId(new_file_id)
        self.add_metadata(page_copy)
        self.workspace.add_file(
            file_id=new_file_id,
            file_grp=self.output_file_grp,
            page_id=input_file.pageId,
            local_filename=os.path.join(self.output_file_grp, new_file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(page_copy),
            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
        )
518
519
    def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
        """
        Process the given ``input_files`` of the :py:data:`workspace`,
        representing one physical page (one :py:class:`.OcrdFile` per input
        fileGrp), under the given :py:data:`.parameter`.

        Each file is parsed as PAGE (files that cannot be converted are logged
        and passed on as ``None``), then handed to :py:meth:`.process_page_pcgts`.
        All images of the result are saved into the output fileGrp and referenced
        from the result, which is finally added to the workspace as a PAGE file.

        (This should be overridden by subclasses to handle cases like
        multiple output fileGrps, non-PAGE input etc.)

        Raises:
            ValueError: if a result image references neither a ``PageType``
                nor an ``AlternativeImageType``.
        """
        parsed_pages : List[Optional[OcrdPage]] = [None] * len(input_files)
        assert isinstance(input_files[0], get_args(OcrdFileType))
        page_id = input_files[0].pageId
        for index, input_file in enumerate(input_files):
            assert isinstance(input_file, get_args(OcrdFileType))
            self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
            try:
                parsed = page_from_file(input_file)
                assert isinstance(parsed, OcrdPage)
                parsed_pages[index] = parsed
            except ValueError as err:
                # not PAGE and not an image to generate PAGE for
                self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
        # output IDs derive from the first input file
        output_file_id = make_file_id(input_files[0], self.output_file_grp)
        result = self.process_page_pcgts(*parsed_pages, page_id=page_id)
        for image_result in result.images:
            image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
            image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
            target = image_result.alternative_image
            if isinstance(target, PageType):
                # special case: not an alternative image, but replacing the original image
                # (this is needed by certain processors when the original's coordinate system
                #  cannot or must not be kept)
                target.set_imageFilename(image_file_path)
                target.set_imageWidth(image_result.pil.width)
                target.set_imageHeight(image_result.pil.height)
            elif isinstance(target, AlternativeImageType):
                target.set_filename(image_file_path)
            else:
                raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
                                 f"{type(image_result.alternative_image)}")
            self.workspace.save_image_file(
                image_result.pil,
                image_file_id,
                self.output_file_grp,
                page_id=page_id,
                file_path=image_file_path,
                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
            )
        output_pcgts = result.pcgts
        output_pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(output_pcgts)
        self.workspace.add_file(
            file_id=output_file_id,
            file_grp=self.output_file_grp,
            page_id=page_id,
            local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(result.pcgts),
            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
        )
578
579
    def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
        """
        Process the given ``input_pcgts`` of the :py:data:`.workspace`,
        representing one physical page (one parsed :py:class:`.OcrdPage`
        per input fileGrp), under the given :py:data:`.parameter`,
        and return the resulting :py:class:`.OcrdPageResult`.

        Optionally, add to the ``images`` attribute of the resulting
        :py:class:`.OcrdPageResult` instances of :py:class:`.OcrdPageResultImage`,
        which have required fields for ``pil`` (:py:class:`PIL.Image` image data),
        ``file_id_suffix`` (used for generating IDs of the saved image) and
        ``alternative_image`` (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType`
        for setting the filename of the saved image).

        The base implementation only raises :py:class:`NotImplementedError`.

        (This contains the main functionality and must be overridden by subclasses,
        unless it does not get called by some overridden :py:meth:`.process_page_file`.)
        """
        raise NotImplementedError()
598
599
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Append a ``processingStep`` :py:class:`~ocrd_models.ocrd_page.MetadataItemType`
        to the PAGE metadata of ``pcgts``. It names the first ``steps`` entry and the
        executable of :py:attr:`ocrd_tool`, and carries two label groups: the
        runtime parameters, and the versions of this processor and of ocrd/core.
        """
        metadata_obj = pcgts.get_Metadata()
        assert metadata_obj is not None
        step_name = self.ocrd_tool['steps'][0]
        executable_name = self.ocrd_tool['executable']
        # one label per runtime parameter
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=name, value=value)
                   for name, value in self.parameter.items()])
        # processor version next to the core version
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=self.ocrd_tool['executable'],
                             value=self.version),
                   LabelType(type_='ocrd/core',
                             value=OCRD_VERSION)])
        item = MetadataItemType(type_="processingStep",
                                name=step_name,
                                value=executable_name,
                                Labels=[parameter_labels, version_labels])
        metadata_obj.add_MetadataItem(item)
624
625
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_

        If ``val`` itself exists as a path, it is returned unchanged. Otherwise
        the first existing entry from ``list_resource_candidates`` is returned.

        Args:
            val (string): resource value to resolve

        Raises:
            ResourceNotFoundError: if no existing candidate is found.
        """
        executable = self.ocrd_tool['executable']
        if exists(val):
            self._base_logger.debug(f"Resolved to absolute path {val}")
            return val
        # FIXME: remove once workspace arg / old_pwd is gone:
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=cwd, moduled=self.moduledir)
        existing = list(filter(exists, candidates))
        if not existing:
            raise ResourceNotFoundError(val, executable)
        first_match = existing[0]
        self._base_logger.debug(f"Resolved {val} to absolute path {first_match}")
        return first_match
649
650
    def show_resource(self, val):
        """
        Resolve a resource name via :py:meth:`resolve_resource` and write its
        contents to binary stdout: a regular file as is, a directory as a
        gzip-compressed tarball of its contents.

        Args:
            val (string): resource value to show
        """
        resource_path = Path(self.resolve_resource(val))
        if not resource_path.is_dir():
            sys.stdout.buffer.write(resource_path.read_bytes())
            return
        # pack from inside the directory, so the archive members are relative to it
        with pushd_popd(resource_path):
            archive = io.BytesIO()
            with tarfile.open(fileobj=archive, mode='w:gz') as tarball:
                tarball.add('.')
            archive.seek(0)
            copyfileobj(archive, sys.stdout.buffer)
670
671
    def list_all_resources(self):
        """
        Iterate over all resources found in the filesystem for this processor,
        keeping only those whose content-type (judged by filename suffix) matches.

        Yields:
            :py:class:`pathlib.Path` objects of the matching resources.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        accept_anything = '*/*' in mimetypes
        for found in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            res = Path(found)
            if not accept_anything:
                if res.is_dir():
                    if 'text/directory' not in mimetypes:
                        continue
                # a MIME type without known extension matches any suffix,
                # otherwise the suffix must equal the extension of some accepted type
                if res.is_file() and all(res.suffix != MIME_TO_EXT.get(mime, res.suffix)
                                         for mime in mimetypes):
                    continue
            yield res
686
687
    @property
    def module(self):
        """
        The top-level module this processor belongs to.

        This is the shortest dotted prefix of ``self.__module__`` whose entry in
        ``sys.modules`` has a non-empty ``__file__`` (i.e. which is not merely a
        namespace package). If no prefix qualifies, ``self.__module__`` itself.
        """
        parts = self.__module__.split('.')
        for depth in range(1, len(parts) + 1):
            prefix = '.'.join(parts[:depth])
            if getattr(sys.modules[prefix], '__file__', None):
                return prefix
        # fall-back
        return self.__module__
702
703
    @property
    def moduledir(self):
        """
        The filesystem path of the module directory, i.e. the location
        ``resource_filename`` reports for ``'.'`` inside :py:attr:`module`.
        """
        top_level_module = self.module
        return resource_filename(top_level_module, '.')
709
710
    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        This delegates to :py:meth:`zip_input_files` (without MIME type filter
        and with ``on_error='abort'``) and unpacks its 1-tuples, so for each
        physical page:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)

        See `algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
        Raises:
            ValueError: if no input fileGrp is set.
            AssertionError: if more than one input fileGrp is set.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        file_tuples = self.zip_input_files(mimetype=None, on_error='abort')
        if file_tuples:
            assert len(file_tuples[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
            return [file_tuple[0] for file_tuple in file_tuples]
        return []
736
737
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all), regardless
          of how many files matched
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Multiple matches for PAGE-XML will always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             on_error (string): How to handle multiple file matches per page.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        Raises:
            ValueError: if no input fileGrp is set.
            NonUniqueInputFile: on multiple PAGE files for a page, or on any
                multiple match with ``on_error='abort'``.
            MissingInputFile: if a page lacks a file in some fileGrp and
                ``config.OCRD_MISSING_INPUT`` is ``abort``.
            Exception: if a conflict occurs and ``on_error`` is not one of
                the strategies above.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = {}
        # (pageId, fileGrp index) slots voided by on_error='skip': they must stay
        # empty, otherwise every odd-numbered extra file would fill the slot again
        skipped = set()

        def resolve_conflict(kept, added, ifg, mime):
            # Decide between the file already kept and the newly added one
            # according to on_error (None means: void this slot).
            if on_error == 'skip':
                return None
            if on_error == 'first':
                return kept # keep first match
            if on_error == 'last':
                return added
            if on_error == 'abort':
                raise NonUniqueInputFile(ifg, added.pageId, mime)
            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype)
            for file_ in files_:
                if not file_.pageId:
                    # ignore document-global files
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                slot = (file_.pageId, i)
                if slot in skipped:
                    # non-unique slot was already voided, do not revive it
                    self._base_logger.debug(f"ignoring file {file_.ID} for skipped page {file_.pageId} in input file group {ifg}")
                    continue
                if not ift[i]:
                    self._base_logger.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}")
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                self._base_logger.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}")
                if mimetype:
                    # filter was active, this must not happen
                    self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                              f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}")
                    ift[i] = resolve_conflict(ift[i], file_, ifg, mimetype)
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise NonUniqueInputFile(ifg, file_.pageId, None)
                    # otherwise keep PAGE match
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                              f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}")
                    ift[i] = resolve_conflict(ift[i], file_, ifg, None)
                if ift[i] is None:
                    skipped.add(slot)
        # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range)
        if self.page_id and not pages:
            self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
                                       f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
        ifts = []
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # could be from non-unique with on_error=skip or from true gap
                    self._base_logger.error(f'Found no file for page {page} in file group {ifg}')
                    if config.OCRD_MISSING_INPUT == 'abort':
                        raise MissingInputFile(ifg, page, mimetype)
            if not any(ifiles):
                # must be from non-unique with on_error=skip
                self._base_logger.warning(f'Found no files for {page} - skipping')
                continue
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
858
859
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
    """Generate a string describing the full CLI of this processor including params.

    Args:
        ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
        processor_instance (object, optional): the processor implementation
            (for adding any module/class/function docstrings)
        subcommand (string, optional): ``'worker'`` or ``'server'`` for the help text
            of the respective subcommand; if empty, the general help text

    Returns:
        The help text, or ``None`` if ``subcommand`` is not recognised.
    """
    doc_help = ''
    if processor_instance:
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
        # Try to find the most concrete docstring among the various methods that an implementation
        # could overload, first serving.
        # In doing so, compare with Processor to avoid a glitch in the way py>=3.5 inherits docstrings.
        # (They are supposed to only repeat information inspect.getdoc, rather than inherit __doc__ itself.)
        for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']:
            instance_method = getattr(processor_instance, method)
            superclass_method = getattr(Processor, method)
            if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__:
                doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n'
                break
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)
    subcommands = '''\
    worker      Start a processing worker rather than do local processing
    server      Start a processor server rather than do local processing
'''

    processing_worker_options = '''\
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --log-filename                  Filename to redirect STDOUT/STDERR to,
                                  if specified.
'''

    processing_server_options = '''\
  --address                       The Processor server address in format
                                  "{host}:{port}"
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
'''

    processing_options = '''\
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those).
                                  Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
  --debug                         Abort on any errors with full stack trace.
                                  Short-hand for OCRD_MISSING_OUTPUT=ABORT
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
                                  If URL starts with http:// start an HTTP server there,
                                  otherwise URL is a path to an on-demand-created unix socket
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]
'''

    information_options = '''\
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version
'''

    parameter_help = ''
    if not ocrd_tool.get('parameters'):
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        for param_name, param in ocrd_tool['parameters'].items():
            # annotate the type with either the REQUIRED marker or the default value (if any)
            parameter_help += wrap('"%s" [%s%s]' % (
                param_name,
                param['type'],
                ' - REQUIRED' if param.get('required') else
                ' - %s' % json.dumps(param['default']) if 'default' in param else ''))
            parameter_help += '\n ' + wrap(param['description'])
            if 'enum' in param:
                parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
            parameter_help += "\n"

    if not subcommand:
        return f'''\
Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]

  {ocrd_tool['description']}{doc_help}

Subcommands:
{subcommands}
Options for processing:
{processing_options}
Options for information:
{information_options}
Parameters:
{parameter_help}
'''
    elif subcommand == 'worker':
        return f'''\
Usage: {ocrd_tool['executable']} worker [OPTIONS]

  Run {ocrd_tool['executable']} as a processing worker.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_worker_options}
'''
    elif subcommand == 'server':
        return f'''\
Usage: {ocrd_tool['executable']} server [OPTIONS]

  Run {ocrd_tool['executable']} as a processor server.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_server_options}
'''
    else:
        # unknown subcommand: nothing to describe
        return None
1005