Passed
Pull Request — master (#1240)
by
unknown
03:40
created

ocrd.processor.base.Processor.input_files()   A

Complexity

Conditions 3

Size

Total Lines 26
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 9
dl 0
loc 26
rs 9.95
c 0
b 0
f 0
cc 3
nop 1
1
"""
2
Processor base class and helper functions.
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from functools import cached_property
13
from os.path import exists, join
14
from shutil import copyfileobj
15
import json
16
import os
17
from os import getcwd
18
from pathlib import Path
19
from typing import List, Optional, Union, get_args
20
import sys
21
import inspect
22
import tarfile
23
import io
24
import weakref
25
from frozendict import frozendict
26
27
from click import wrap_text
28
from deprecated import deprecated
29
from requests import HTTPError
30
31
from ocrd.workspace import Workspace
32
from ocrd_models.ocrd_file import OcrdFileType
33
from ocrd.processor.ocrd_page_result import OcrdPageResult
34
from ocrd_utils import (
35
    VERSION as OCRD_VERSION,
36
    MIMETYPE_PAGE,
37
    MIME_TO_EXT,
38
    config,
39
    getLogger,
40
    list_resource_candidates,
41
    pushd_popd,
42
    list_all_resources,
43
    get_processor_resource_types,
44
    resource_filename,
45
    parse_json_file_with_comments,
46
    make_file_id,
47
    deprecation_warning
48
)
49
from ocrd_validators import ParameterValidator
50
from ocrd_models.ocrd_page import (
51
    PageType,
52
    AlternativeImageType,
53
    MetadataItemType,
54
    LabelType,
55
    LabelsType,
56
    OcrdPage,
57
    to_xml,
58
)
59
from ocrd_modelfactory import page_from_file
60
from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
61
62
# XXX imports must remain for backwards-compatibility
63
from .helpers import run_cli, run_processor # pylint: disable=unused-import
64
65
66
class ResourceNotFoundError(FileNotFoundError):
    """
    Raised when a requested processor resource cannot be resolved.

    Attributes:
        name: the resource name that was requested
        executable: the processor executable the resource was requested for
        message: the error text, which is also passed on to the base exception
    """
    def __init__(self, name, executable):
        problem = f"Could not find resource '{name}' for executable '{executable}'. "
        remedy = f"Try 'ocrd resmgr download {executable} {name}' to download this resource."
        self.name, self.executable = name, executable
        self.message = problem + remedy
        super().__init__(self.message)
77
78
class NonUniqueInputFile(ValueError):
    """
    Raised when a fileGrp / pageId / mimetype selector is ambiguous,
    i.e. it yields multiple PAGE files, or no PAGE files but multiple
    images, or multiple files of the requested mimetype.

    Attributes:
        fileGrp, pageId, mimetype: the selector that was used
        message: the error text, which is also passed on to the base exception
    """
    def __init__(self, fileGrp, pageId, mimetype):
        # an empty/None mimetype means the default PAGE-or-image lookup was used
        shown_mimetype = mimetype or 'PAGE+image(s)'
        self.fileGrp, self.pageId, self.mimetype = fileGrp, pageId, mimetype
        self.message = (f"Could not determine unique input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {shown_mimetype}")
        super().__init__(self.message)
91
92
class MissingInputFile(ValueError):
    """
    Raised when a fileGrp / pageId / mimetype selector matches nothing,
    i.e. it yields no PAGE files, or no PAGE and no image files,
    or no files of the requested mimetype.

    Attributes:
        fileGrp, pageId, mimetype: the selector that was used
        message: the error text, which is also passed on to the base exception
    """
    def __init__(self, fileGrp, pageId, mimetype):
        # an empty/None mimetype means the default PAGE-or-image lookup was used
        shown_mimetype = mimetype or 'PAGE+image(s)'
        self.fileGrp, self.pageId, self.mimetype = fileGrp, pageId, mimetype
        self.message = (f"Could not find input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {shown_mimetype}")
        super().__init__(self.message)
105
106
class Processor():
107
    """
108
    A processor is a tool that implements the uniform OCR-D
109
    `command-line interface for run-time data processing <https://ocr-d.de/en/spec/cli>`_.
110
111
    That is, it executes a single workflow step, or a combination of workflow steps,
112
    on the workspace (represented by local METS). It reads input files for all or selected
113
    physical pages of the input fileGrp(s), computes additional annotation, and writes output
114
    files for them into the output fileGrp(s). It may take a number of optional or mandatory
115
    parameters.
116
    """
117
118
    max_instances : int = -1
119
    """
120
    maximum number of cached instances (ignored if negative), to be applied on top of
121
    :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
122
123
    (Override this if you know how many instances fit into memory at once.)
124
    """
125
126
    @property
    def metadata_filename(self) -> str:
        """
        Location of the ``ocrd-tool.json`` file, relative to the package.

        Consumed by :py:data:`metadata_location`.

        (Override this if ``ocrd-tool.json`` does not sit in the root of the
        module, e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``).
        """
        default_name = 'ocrd-tool.json'
        return default_name
137
138
    @cached_property
    def metadata_location(self) -> Path:
        """
        Path of the ``ocrd-tool.json`` file, looked up via ``resource_filename``
        in the top-level package of this processor's module.

        Consumed by :py:data:`metadata_rawdict`.

        (Override this if ``ocrd-tool.json`` is not distributed with the Python package.)
        """
        # XXX HACK: a leading ``src`` component is not part of the package name,
        # so the package is the component right after it
        name_parts = self.__module__.split('.')
        package = name_parts[1] if name_parts[0] == 'src' else name_parts[0]
        return resource_filename(package, self.metadata_filename)
152
153
    @cached_property
    def metadata_rawdict(self) -> dict:
        """
        The ``ocrd-tool.json`` contents as parsed from :py:data:`metadata_location`,
        before any validation by :py:data:`metadata`.

        Consumed by :py:data:`metadata`.

        (Override this if ``ocrd-tool.json`` is not in a file.)
        """
        json_path = self.metadata_location
        return parse_json_file_with_comments(json_path)
163
164
    @cached_property
    def metadata(self) -> dict:
        """
        The ``ocrd-tool.json`` dict contents of the package, according to the OCR-D
        `spec <https://ocr-d.de/en/spec/ocrd_tool>`_ for processor tools.

        The raw dict from :py:data:`metadata_rawdict` is run through
        :py:class:`OcrdToolValidator`. A failed validation is only logged as
        an error; the dict is returned in either case.

        Consumed by :py:data:`ocrd_tool` and :py:data:`version`.

        (Override this if you want to provide metadata programmatically instead of a
        JSON file.)
        """
        tool_json = self.metadata_rawdict
        validation = OcrdToolValidator.validate(tool_json)
        if validation.is_valid:
            return tool_json
        # a report without errors is merely "problematic"
        severity = 'invalid' if validation.errors else 'problematic'
        self.logger.error(f"The ocrd-tool.json of this processor is {severity}:\n"
                          f"{validation.to_xml()}.\nPlease open an issue at {tool_json.get('git_url', 'the website')}.")
        return tool_json
185
186
    @cached_property
    def version(self) -> str:
        """
        The program version of the package, i.e. the ``version`` entry
        of :py:data:`metadata`.

        (Override this if you do not want to use the :py:data:`metadata`
        lookup mechanism.)
        """
        package_metadata = self.metadata
        return package_metadata['version']
196
197
    @cached_property
    def executable(self) -> str:
        """
        The executable name of this processor tool, derived from the file name
        of the outermost frame of the current call stack.

        Consumed by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`.

        (Override this if your entry-point name deviates from the ``executable``
        name, or the processor gets instantiated from another runtime.)
        """
        # the last stack entry is the outermost caller, i.e. the running script
        outermost_frame = inspect.stack()[-1]
        return os.path.basename(outermost_frame.filename)
209
210
    @cached_property
    def ocrd_tool(self) -> dict:
        """
        The ``ocrd-tool.json`` dict contents of this processor tool, i.e. the
        entry for :py:data:`executable` in the ``tools`` part of :py:data:`metadata`.

        (Override this if you do not want to use the :py:data:`metadata`
        lookup mechanism.)
        """
        all_tools = self.metadata['tools']
        return all_tools[self.executable]
221
222
    @property
    def parameter(self) -> Optional[dict]:
        """the runtime parameter dict to be used by this processor (``None`` until set)"""
        return getattr(self, '_parameter', None)

    @parameter.setter
    def parameter(self, parameter : dict) -> None:
        # parameters were already set once: tear down before reconfiguring
        previously_configured = self.parameter is not None
        if previously_configured:
            self.shutdown()
        validation = ParameterValidator(self.ocrd_tool).validate(parameter)
        if not validation.is_valid:
            raise ValueError(f'Invalid parameters:\n{validation.to_xml()}')
        # freeze the dict, so parameters are read-only from here on
        self._parameter = frozendict(parameter)
        # (re-)run setup to load models etc
        self.setup()
241
242
    def __init__(
            self,
            # FIXME: remove in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=config.OCRD_DOWNLOAD_INPUT,
            version=None
    ):
        """
        Instantiate the processor.

        If ``parameter`` is given, it is assigned via the :py:data:`.parameter`
        setter, which validates it and then calls :py:meth:`.setup`. Otherwise
        the processor is not set up here.

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process.
                 If not ``None``, then `chdir` to that directory.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
        Keyword Args:
             ocrd_tool (dict): ocrd-tool description of this processor. If given, it overrides
                 the :py:data:`ocrd_tool` and :py:data:`executable` lookup.
                 Deprecated: use or override the metadata/executable/ocrd-tool properties instead.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``.
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
             output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process
                 (or empty for all pages).
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
             download_files (boolean): Whether input files will be downloaded prior to processing,
                 defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT`
             version (string): program version. If given, it overrides the :py:data:`version` lookup.
                 Deprecated: use or override the metadata/version properties instead.
        """
        if ocrd_tool is not None:
            deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                "use or override metadata/executable/ocrd-tool properties instead")
            # kept for code that may still read the private attributes
            self._ocrd_tool = ocrd_tool
            self._executable = ocrd_tool['executable']
            # The properties are cached_property descriptors, which permit plain
            # attribute writes: storing the passed values under the public names
            # makes them take precedence over the metadata-based lookup
            # (previously the kwarg was stored but never consulted).
            self.ocrd_tool = ocrd_tool
            self.executable = ocrd_tool['executable']
        if version is not None:
            deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
                                "use or override metadata/version properties instead")
            self._version = version
            # same as above: let the passed value override the metadata lookup
            self.version = version
        if workspace is not None:
            deprecation_warning("Passing a workspace argument other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.workspace = workspace
            # remember where we came from (resolve_resource searches relative to it)
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        if input_file_grp is not None:
            deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.input_file_grp = input_file_grp
        if output_file_grp is not None:
            deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.output_file_grp = output_file_grp
        if page_id is not None:
            deprecation_warning("Passing a page_id kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            # normalise the empty string to None
            self.page_id = page_id or None
        self.download = download_files
        #: The logger to be used by processor implementations.
        # `ocrd.processor.base` internals should use :py:attr:`self._base_logger`
        self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
        self._base_logger = getLogger('ocrd.processor.base')
        # must come after the loggers are set: the setter validates and runs setup()
        if parameter is not None:
            self.parameter = parameter
        # ensure that shutdown gets called at destruction
        # NOTE(review): the weakref.finalize docs advise against passing a bound
        # method of the tracked object, since it keeps the object alive; confirm
        # whether shutdown is then only reached at interpreter exit.
        self._finalizer = weakref.finalize(self, self.shutdown)
        # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
        setattr(self, 'process',
                deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
318
319
    def show_help(self, subcommand=None):
        """
        Print the text produced by ``generate_processor_help`` for this
        processor's :py:attr:`ocrd_tool` (passing on ``subcommand``) on stdout.
        """
        help_text = generate_processor_help(self.ocrd_tool,
                                            processor_instance=self,
                                            subcommand=subcommand)
        print(help_text)
325
326
    def show_version(self):
        """
        Print this processor's :py:attr:`version` and the OCR-D core version on stdout.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
331
332
    def verify(self):
        """
        Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.

        Both must be set. The number of comma-separated groups in each must match
        the respective ``*_file_grp_cardinality`` entry of :py:attr:`ocrd_tool`
        (an exact count, or a ``[min, max]`` list; non-positive values mean
        "unconstrained"). Every input group must exist in the workspace METS.

        Returns:
            ``True`` (kept for backwards compatibility)
        Raises:
            AssertionError: if any of the checks fails
        """
        assert self.input_file_grp is not None
        assert self.output_file_grp is not None
        in_grps = self.input_file_grp.split(',')
        out_grps = self.output_file_grp.split(',')

        def check_cardinality(groups : List[str], spec : Union[int, List[int]], template):
            count = len(groups)
            if isinstance(spec, int):
                # exact number required (unless non-positive)
                assert spec <= 0 or count == spec, template % (count, str(spec))
                return
            assert isinstance(spec, list)
            lower, upper = spec[0], spec[1]
            assert lower <= 0 or count >= lower, template % (count, str(spec))
            assert upper <= 0 or count <= upper, template % (count, str(spec))

        check_cardinality(in_grps, self.ocrd_tool['input_file_grp_cardinality'],
                          "Unexpected number of input file groups %d vs %s")
        check_cardinality(out_grps, self.ocrd_tool['output_file_grp_cardinality'],
                          "Unexpected number of output file groups %d vs %s")
        for grp in in_grps:
            assert grp in self.workspace.mets.file_groups
        # keep this for backwards compatibility:
        return True
360
361
    def dump_json(self):
        """
        Print :py:attr:`ocrd_tool`, serialized as indented JSON, on stdout.
        """
        serialized = json.dumps(self.ocrd_tool, indent=True)
        print(serialized)
366
367
    def dump_module_dir(self):
        """
        Print :py:attr:`moduledir` on stdout.
        """
        module_directory = self.moduledir
        print(module_directory)
372
373
    def list_resources(self):
        """
        Print every entry yielded by :py:meth:`list_all_resources` on stdout, one per line.
        """
        for resource_path in self.list_all_resources():
            print(resource_path)
379
380
    def setup(self) -> None:
381
        """
382
        Prepare the processor for actual data processing,
383
        prior to changing to the workspace directory but
384
        after parsing parameters.
385
386
        (Override this to load models into memory etc.)
387
        """
388
        pass
389
390
    def shutdown(self) -> None:
        """
        Bring down the processor after data processing,
        after changing back from the workspace directory but
        before exiting (or setting up with different parameters).

        Does nothing by default.

        (Override this to unload models from memory etc.)
        """
399
400
    @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
    def process(self) -> None:
        """
        Process all files of the :py:data:`workspace`
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        The base implementation only raises :py:class:`NotImplementedError`;
        processors still using this deprecated entry point have to override it.
        """
        raise NotImplementedError
413
414
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        Runs inside ``pushd_popd(workspace.directory)``, stores ``workspace``
        on the instance and calls :py:meth:`.verify`. Then, for every tuple
        from :py:meth:`.zip_input_files`, the input files are downloaded (if
        :py:attr:`download` is set) and passed to :py:meth:`.process_page_file`.
        Failures are handled according to ``config.OCRD_EXISTING_OUTPUT``
        and ``config.OCRD_MISSING_OUTPUT``.

        If :py:class:`NotImplementedError` is raised along the way, the
        deprecated :py:meth:`.process` is called instead.

        (This should be overridden by subclasses to handle cases
        like post-processing or computation across pages.)

        Raises:
            Exception: if the share of skipped or copied pages violates
                ``config.OCRD_MAX_MISSING_OUTPUTS``, or if page processing
                fails and ``config.OCRD_MISSING_OUTPUT`` is ``ABORT``
            ValueError: if ``config.OCRD_MISSING_OUTPUT`` has an unknown value
        """
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            self.verify()
            try:
                nr_succeeded = 0
                nr_skipped = 0
                nr_copied = 0
                # FIXME: add page parallelization by running multiprocessing.Pool (#322)
                for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
                    input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
                    # the page ID is taken from the first fileGrp that has a file for this page
                    page_id = next(input_file.pageId
                                   for input_file in input_file_tuple
                                   if input_file)
                    self._base_logger.info(f"processing page {page_id}")
                    for i, input_file in enumerate(input_file_tuple):
                        if input_file is None:
                            # file/page not found in this file grp
                            continue
                        input_files[i] = input_file
                        if not self.download:
                            continue
                        try:
                            input_files[i] = self.workspace.download_file(input_file)
                        except (ValueError, FileNotFoundError, HTTPError) as e:
                            # keep going with the non-downloaded file entry
                            self._base_logger.error(repr(e))
                            self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
                    # FIXME: differentiate error cases in various ways:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    try:
                        self.process_page_file(*input_files)
                        nr_succeeded += 1
                    # exclude NotImplementedError, so we can try process() below
                    except NotImplementedError:
                        raise
                    # handle input failures separately
                    except FileExistsError as err:
                        if config.OCRD_EXISTING_OUTPUT == 'ABORT':
                            raise err
                        if config.OCRD_EXISTING_OUTPUT == 'SKIP':
                            continue
                        if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
                            # too late here, must not happen
                            raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
                    # broad coverage of output failures
                    except Exception as err:
                        # FIXME: add re-usable/actionable logging
                        self._base_logger.exception(f"Failure on page {page_id}: {err}")
                        if config.OCRD_MISSING_OUTPUT == 'ABORT':
                            raise err
                        if config.OCRD_MISSING_OUTPUT == 'SKIP':
                            nr_skipped += 1
                            continue
                        if config.OCRD_MISSING_OUTPUT == 'COPY':
                            # fall back to passing through the first input file
                            self._copy_page_file(input_files[0])
                            nr_copied += 1
                        else:
                            desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
                            raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
                # NOTE(review): these compare succeeded/failed ratios against
                # OCRD_MAX_MISSING_OUTPUTS - confirm this matches the documented
                # meaning of that setting.
                if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
                    raise Exception(f"too many failures with skipped output ({nr_skipped})")
                if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
                    # report the number of copied pages (was: nr_skipped)
                    raise Exception(f"too many failures with fallback output ({nr_copied})")
            except NotImplementedError:
                # fall back to deprecated method
                self.process()
494
495
    def _copy_page_file(self, input_file : OcrdFileType) -> None:
        """
        Copy the given ``input_file`` of the :py:data:`workspace`
        into the output fileGrp, as if it was a processing result.

        The file is parsed via ``page_from_file``, gets a new ``pcGtsId``
        and a processing-step metadata entry (see :py:meth:`.add_metadata`),
        and is then added to the workspace as PAGE-XML. If parsing raises
        :py:class:`ValueError`, the error is logged and nothing is added.
        """
        assert isinstance(input_file, get_args(OcrdFileType))
        source_page_id = input_file.pageId
        self._base_logger.debug(f"parsing file {input_file.ID} for page {source_page_id}")
        pcgts : OcrdPage
        try:
            pcgts = page_from_file(input_file)
        except ValueError as err:
            # not PAGE and not an image to generate PAGE for
            self._base_logger.error(f"non-PAGE input for page {input_file.pageId}: {err}")
            return
        new_file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(new_file_id)
        self.add_metadata(pcgts)
        target_path = os.path.join(self.output_file_grp, new_file_id + '.xml')
        self.workspace.add_file(
            file_id=new_file_id,
            file_grp=self.output_file_grp,
            page_id=input_file.pageId,
            local_filename=target_path,
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
        )
522
523
    def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
        """
        Process the given ``input_files`` of the :py:data:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`.OcrdFile` per input fileGrp)
        under the given :py:data:`.parameter`, and make sure the
        results get added accordingly.

        The first input file must be present: it determines the page ID and the
        output file ID. Any further entry may be ``None`` (no file for this page
        in that fileGrp); its slot is then passed to :py:meth:`.process_page_pcgts`
        as ``None``, just like an input that cannot be parsed as PAGE.

        All images of the result are saved into the output fileGrp and
        referenced from the result PAGE, which is finally added as PAGE-XML.

        (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses
        to handle cases like multiple output fileGrps, non-PAGE input etc.)

        Raises:
            AssertionError: if the first input file is missing
            ValueError: if a result image references neither a
                :py:class:`PageType` nor an :py:class:`AlternativeImageType`
        """
        input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
        assert isinstance(input_files[0], get_args(OcrdFileType))
        page_id = input_files[0].pageId
        for i, input_file in enumerate(input_files):
            if input_file is None:
                # no file for this page in that fileGrp - the signature allows this
                # (previously this tripped the assertion below and failed the page)
                self._base_logger.debug(f"ignoring missing input file no. {i} for page {page_id}")
                continue
            assert isinstance(input_file, get_args(OcrdFileType))
            self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
            try:
                page_ = page_from_file(input_file)
                assert isinstance(page_, OcrdPage)
                input_pcgts[i] = page_
            except ValueError as err:
                # not PAGE and not an image to generate PAGE for
                self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
        output_file_id = make_file_id(input_files[0], self.output_file_grp)
        result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
        for image_result in result.images:
            image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
            image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
            if isinstance(image_result.alternative_image, PageType):
                # special case: not an alternative image, but replacing the original image
                # (this is needed by certain processors when the original's coordinate system
                #  cannot or must not be kept)
                image_result.alternative_image.set_imageFilename(image_file_path)
                image_result.alternative_image.set_imageWidth(image_result.pil.width)
                image_result.alternative_image.set_imageHeight(image_result.pil.height)
            elif isinstance(image_result.alternative_image, AlternativeImageType):
                image_result.alternative_image.set_filename(image_file_path)
            else:
                raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
                                 f"{type(image_result.alternative_image)}")
            self.workspace.save_image_file(
                image_result.pil,
                image_file_id,
                self.output_file_grp,
                page_id=page_id,
                file_path=image_file_path,
                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
            )
        result.pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(result.pcgts)
        self.workspace.add_file(file_id=output_file_id,
                                file_grp=self.output_file_grp,
                                page_id=page_id,
                                local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
                                mimetype=MIMETYPE_PAGE,
                                content=to_xml(result.pcgts),
                                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
        )
582
583
    def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
        """
        Process the given ``input_pcgts`` of the :py:data:`.workspace`,
        representing one physical page (passed as one parsed
        :py:class:`.OcrdPage` per input fileGrp)
        under the given :py:data:`.parameter`, and return the
        resulting :py:class:`.OcrdPageResult`.

        Implementations may add :py:class:`.OcrdPageResultImage` instances
        to the ``images`` attribute of the result. Each of them needs
        ``pil`` (:py:class:`PIL.Image` image data), ``file_id_suffix``
        (used for generating IDs of the saved image) and ``alternative_image``
        (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType`
        for setting the filename of the saved image).

        The base implementation only raises :py:class:`NotImplementedError`.

        (This contains the main functionality and must be overridden by subclasses,
        unless it does not get called by some overridden :py:meth:`.process_page_file`.)
        """
        raise NotImplementedError
602
603
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
        the processing step and runtime parameters to :py:class:`.OcrdPage` ``pcgts``.

        The item is named after the first entry of the tool's ``steps``, carries
        the tool's ``executable`` as value, and has two label groups: one with all
        runtime parameters, one with the processor and ocrd/core versions.
        """
        page_metadata = pcgts.get_Metadata()
        assert page_metadata is not None
        step_name = self.ocrd_tool['steps'][0]
        tool_name = self.ocrd_tool['executable']
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=val)
                   for key, val in self.parameter.items()])
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=self.ocrd_tool['executable'],
                             value=self.version),
                   LabelType(type_='ocrd/core',
                             value=OCRD_VERSION)])
        step_item = MetadataItemType(type_="processingStep",
                                     name=step_name,
                                     value=tool_name,
                                     Labels=[parameter_labels, version_labels])
        page_metadata.add_MetadataItem(step_item)
628
629
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_

        If ``val`` itself exists as a path, it is returned as is. Otherwise the
        first existing candidate from ``list_resource_candidates`` is returned.

        Args:
            val (string): resource value to resolve
        Raises:
            ResourceNotFoundError: if neither ``val`` nor any candidate exists
        """
        executable = self.ocrd_tool['executable']
        if exists(val):
            self._base_logger.debug("Resolved to absolute path %s" % val)
            return val
        # FIXME: remove once workspace arg / old_pwd is gone:
        search_cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=search_cwd, moduled=self.moduledir)
        existing = list(filter(exists, candidates))
        if not existing:
            raise ResourceNotFoundError(val, executable)
        self._base_logger.debug("Resolved %s to absolute path %s" % (val, existing[0]))
        return existing[0]
653
654
    def show_resource(self, val):
        """
        Dump the contents of a processor resource to stdout.

        The resource is located via :py:meth:`resolve_resource`, i.e. with the
        algorithm in `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_.
        A regular file is written verbatim. A directory is first packed into
        a gzip-compressed tarball (assembled in memory, with member paths
        relative to the directory itself), which is then written.

        Args:
            val (string): resource value to show
        """
        resource_path = Path(self.resolve_resource(val))
        if not resource_path.is_dir():
            sys.stdout.buffer.write(resource_path.read_bytes())
            return
        # change into the directory so that the archive members start at '.'
        with pushd_popd(resource_path):
            buffer = io.BytesIO()
            with tarfile.open(fileobj=buffer, mode='w:gz') as archive:
                archive.add('.')
            # rewind before streaming the finished archive
            buffer.seek(0)
            copyfileobj(buffer, sys.stdout.buffer)
674
675
    def list_all_resources(self):
        """
        Iterate over all resources found in the filesystem for this processor,
        keeping only those which match the processor's resource content-types
        (judged by filename suffix).

        Yields:
            :py:class:`pathlib.Path` objects of the matching resources.
        """
        accepted = get_processor_resource_types(None, self.ocrd_tool)
        for found in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            found = Path(found)
            if '*/*' in accepted:
                # wildcard content-type: nothing to filter
                yield found
                continue
            if found.is_dir() and 'text/directory' not in accepted:
                continue
            # if we do not know all MIME types, then keep the file, otherwise require suffix match
            # (an unknown MIME type maps to the file's own suffix, and thus always matches)
            if found.is_file() and all(found.suffix != MIME_TO_EXT.get(mime, found.suffix)
                                       for mime in accepted):
                continue
            yield found
690
691
    @property
    def module(self):
        """
        The top-level module this processor belongs to.

        Walks the dotted module path of the processor from left to right and
        returns the shortest prefix whose entry in :py:data:`sys.modules` has
        a non-empty ``__file__`` (i.e. is not just a namespace package).
        If no prefix qualifies, the full module path is returned.
        """
        parts = self.__module__.split('.')
        for depth in range(1, len(parts) + 1):
            prefix = '.'.join(parts[:depth])
            if getattr(sys.modules[prefix], '__file__', None):
                return prefix
        # fall-back
        return self.__module__
706
707
    @property
    def moduledir(self):
        """
        The filesystem path of the module directory, i.e. the location of
        ``.`` as a resource of the top-level :py:attr:`module`.
        """
        top_level = self.module
        return resource_filename(top_level, '.')
713
714
    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        For each physical page:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)

        See `algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.

        Raises:
            ValueError: if no input fileGrp is set.
            AssertionError: if :py:attr:`input_file_grp` is multi-valued
                (use :py:meth:`zip_input_files` in that case).
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        file_tuples = self.zip_input_files(mimetype=None, on_error='abort')
        if not file_tuples:
            return []
        if len(file_tuples[0]) != 1:
            # raised explicitly (instead of using an ``assert`` statement),
            # so that the check is not stripped when running under ``python -O``
            raise AssertionError('Use zip_input_files() instead of input_files '
                                 'when processing multiple input fileGrps')
        return [file_tuple[0] for file_tuple in file_tuples]
740
741
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all), regardless
          of how many further files match for that page
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        When no MIME type filter is active, multiple matches for PAGE-XML
        will always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             on_error (string): How to handle multiple file matches per page.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.

        Raises:
            ValueError: if no input fileGrp is set, or if a conflict occurs
                and ``on_error`` is not one of the strategies above.
            NonUniqueInputFile: for multiple PAGE-XML matches (without MIME
                type filter), or for any conflict with ``on_error='abort'``.
            MissingInputFile: if a page has no file in some fileGrp and
                ``config.OCRD_MISSING_INPUT`` is ``abort``.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        ifgs = self.input_file_grp.split(",")

        def resolve_conflict(current, candidate, ifg, conflict_mimetype):
            """Return the file to keep (or None) for a non-unique page, according to ``on_error``."""
            if on_error == 'skip':
                return None
            if on_error == 'first':
                return current  # keep first match
            if on_error == 'last':
                return candidate
            if on_error == 'abort':
                raise NonUniqueInputFile(ifg, candidate.pageId, conflict_mimetype)
            raise ValueError("Unknown 'on_error' strategy '%s'" % on_error)

        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = {}
        # (pageId, fileGrp index) slots which were emptied by on_error=skip:
        # these must stay empty, even if yet more files match for them
        skipped = set()
        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype)
            for file_ in files_:
                if not file_.pageId:
                    # ignore document-global files
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if (file_.pageId, i) in skipped:
                    self._base_logger.debug(f"ignoring file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                            f"(page already skipped as non-unique)")
                    continue
                if not ift[i]:
                    self._base_logger.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}")
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                self._base_logger.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}")
                if mimetype:
                    # filter was active, this must not happen
                    self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                              f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}")
                    conflict_mimetype = mimetype
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise NonUniqueInputFile(ifg, file_.pageId, None)
                    # keep PAGE match
                    continue
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                              f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}")
                    conflict_mimetype = None
                ift[i] = resolve_conflict(ift[i], file_, ifg, conflict_mimetype)
                if ift[i] is None:
                    skipped.add((file_.pageId, i))
        # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range)
        if self.page_id and not pages:
            self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
                                       f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
        ifts = []
        for page, ifiles in pages.items():
            for ifg, ifile in zip(ifgs, ifiles):
                if not ifile:
                    # could be from non-unique with on_error=skip or from true gap
                    self._base_logger.error(f'Found no file for page {page} in file group {ifg}')
                    if config.OCRD_MISSING_INPUT == 'abort':
                        raise MissingInputFile(ifg, page, mimetype)
            if not any(ifiles):
                # must be from non-unique with on_error=skip
                self._base_logger.warning(f'Found no files for {page} - skipping')
                continue
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
862
863
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
    """Generate a string describing the full CLI of this processor including params.

    Args:
        ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
        processor_instance (object, optional): the processor implementation
            (for adding any module/class/function docstrings)
        subcommand (string, optional): ``'worker'`` or ``'server'`` to get the help
            for that subcommand only; if empty, the help for regular processing

    Returns:
        The help text, or ``None`` if ``subcommand`` is not recognised.
    """
    doc_help = ''
    if processor_instance:
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
        # Try to find the most concrete docstring among the various methods that an implementation
        # could overload, first serving.
        # In doing so, compare with Processor to avoid a glitch in the way py>=3.5 inherits docstrings.
        # (They are supposed to only repeat information inspect.getdoc, rather than inherit __doc__ itself.)
        for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']:
            instance_method = getattr(processor_instance, method)
            superclass_method = getattr(Processor, method)
            if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__:
                doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n'
                break
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)
    subcommands = '''\
    worker      Start a processing worker rather than do local processing
    server      Start a processor server rather than do local processing
'''

    processing_worker_options = '''\
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --log-filename                  Filename to redirect STDOUT/STDERR to,
                                  if specified.
'''

    processing_server_options = '''\
  --address                       The Processor server address in format
                                  "{host}:{port}"
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
'''

    processing_options = '''\
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those).
                                  Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
  --debug                         Abort on any errors with full stack trace.
                                  Short-hand for OCRD_MISSING_OUTPUT=ABORT
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
                                  If URL starts with http:// start an HTTP server there,
                                  otherwise URL is a path to an on-demand-created unix socket
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]
'''

    information_options = '''\
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version
'''

    parameter_help = ''
    if not ocrd_tool.get('parameters'):
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        for param_name, param in ocrd_tool['parameters'].items():
            # annotate each parameter with either its requiredness or its default (if any)
            if param.get('required'):
                remark = ' - REQUIRED'
            elif 'default' in param:
                remark = ' - %s' % json.dumps(param['default'])
            else:
                remark = ''
            parameter_help += wrap('"%s" [%s%s]' % (param_name, param['type'], remark))
            parameter_help += '\n ' + wrap(param['description'])
            if 'enum' in param:
                parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
            parameter_help += "\n"

    if not subcommand:
        return f'''\
Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]

  {ocrd_tool['description']}{doc_help}

Subcommands:
{subcommands}
Options for processing:
{processing_options}
Options for information:
{information_options}
Parameters:
{parameter_help}
'''
    if subcommand == 'worker':
        return f'''\
Usage: {ocrd_tool['executable']} worker [OPTIONS]

  Run {ocrd_tool['executable']} as a processing worker.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_worker_options}
'''
    if subcommand == 'server':
        return f'''\
Usage: {ocrd_tool['executable']} server [OPTIONS]

  Run {ocrd_tool['executable']} as a processor server.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_server_options}
'''
    # unknown subcommand: there is no help text to offer
    return None
1009