Passed
Pull Request — master (#1240)
by Konstantin
03:02
created

ocrd.processor.base.Processor.verify()   B

Complexity

Conditions 6

Size

Total Lines 28
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 23
dl 0
loc 28
rs 8.3946
c 0
b 0
f 0
cc 6
nop 1
1
"""
2
Processor base class and helper functions.
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from functools import cached_property
13
from os.path import exists, join
14
from shutil import copyfileobj
15
import json
16
import os
17
from os import getcwd
18
from pathlib import Path
19
from typing import List, Optional, Union, get_args
20
import sys
21
import inspect
22
import tarfile
23
import io
24
import weakref
25
from frozendict import frozendict
26
from deprecated import deprecated
27
from requests import HTTPError
28
29
from ocrd.workspace import Workspace
30
from ocrd_models.ocrd_file import OcrdFileType
31
from ocrd.processor.ocrd_page_result import OcrdPageResult
32
from ocrd_utils import (
33
    VERSION as OCRD_VERSION,
34
    MIMETYPE_PAGE,
35
    MIME_TO_EXT,
36
    config,
37
    getLogger,
38
    list_resource_candidates,
39
    pushd_popd,
40
    list_all_resources,
41
    get_processor_resource_types,
42
    resource_filename,
43
    parse_json_file_with_comments,
44
    make_file_id,
45
    deprecation_warning
46
)
47
from ocrd_validators import ParameterValidator
48
from ocrd_models.ocrd_page import (
49
    PageType,
50
    AlternativeImageType,
51
    MetadataItemType,
52
    LabelType,
53
    LabelsType,
54
    OcrdPage,
55
    to_xml,
56
)
57
from ocrd_modelfactory import page_from_file
58
from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
59
60
# XXX imports must remain for backwards-compatibility
61
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
62
63
64
class ResourceNotFoundError(FileNotFoundError):
    """
    Raised when a requested processor resource cannot be resolved.

    The instance keeps the resource ``name``, the ``executable`` it was
    requested for, and the rendered ``message`` (which is also passed on
    to :py:class:`FileNotFoundError`).
    """
    def __init__(self, name, executable):
        problem = f"Could not find resource '{name}' for executable '{executable}'. "
        remedy = f"Try 'ocrd resmgr download {executable} {name}' to download this resource."
        self.name, self.executable = name, executable
        self.message = problem + remedy
        super().__init__(self.message)
75
76
class NonUniqueInputFile(ValueError):
    """
    Raised when the fileGrp / pageId / mimetype selector is ambiguous:
    it matches multiple PAGE files, or no PAGE file but multiple images,
    or multiple files of the requested mimetype.
    """
    def __init__(self, fileGrp, pageId, mimetype):
        self.fileGrp, self.pageId, self.mimetype = fileGrp, pageId, mimetype
        # without an explicit mimetype the selector covers PAGE and images
        shown_mimetype = mimetype or 'PAGE+image(s)'
        self.message = (f"Could not determine unique input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {shown_mimetype}")
        super().__init__(self.message)
89
90
class MissingInputFile(ValueError):
    """
    Raised when the fileGrp / pageId / mimetype selector matches nothing:
    no PAGE file, or neither PAGE nor image files, or no file of the
    requested mimetype.
    """
    def __init__(self, fileGrp, pageId, mimetype):
        self.fileGrp, self.pageId, self.mimetype = fileGrp, pageId, mimetype
        # without an explicit mimetype the selector covers PAGE and images
        shown_mimetype = mimetype or 'PAGE+image(s)'
        self.message = (f"Could not find input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {shown_mimetype}")
        super().__init__(self.message)
103
104
class Processor():
105
    """
106
    A processor is a tool that implements the uniform OCR-D
107
    `command-line interface for run-time data processing <https://ocr-d.de/en/spec/cli>`_.
108
109
    That is, it executes a single workflow step, or a combination of workflow steps,
110
    on the workspace (represented by local METS). It reads input files for all or selected
111
    physical pages of the input fileGrp(s), computes additional annotation, and writes output
112
    files for them into the output fileGrp(s). It may take a number of optional or mandatory
113
    parameters.
114
    """
115
116
    max_instances : int = -1
117
    """
118
    maximum number of cached instances (ignored if negative), to be applied on top of
119
    :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
120
121
    (Override this if you know how many instances fit into memory at once.)
122
    """
123
124
    @property
125
    def metadata_filename(self) -> str:
126
        """
127
        Relative location of the ``ocrd-tool.json`` file inside the package.
128
129
        Used by :py:data:`metadata_location`.
130
131
        (Override if ``ocrd-tool.json`` is not in the root of the module,
132
        e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``).
133
        """
134
        return 'ocrd-tool.json'
135
136
    @cached_property
    def metadata_location(self) -> Path:
        """
        Absolute path of the ``ocrd-tool.json`` file shipped with the package.

        Looked up via ``resource_filename`` in the top-level package of the
        module defining this processor, under :py:data:`metadata_filename`.

        Consumed by :py:data:`metadata_rawdict`.

        (Override when ``ocrd-tool.json`` is not distributed with the Python package.)
        """
        top_level_package, _, _ = self.__module__.partition('.')
        return resource_filename(top_level_package, self.metadata_filename)
146
147
    @cached_property
    def metadata_rawdict(self) -> dict:
        """
        The ``ocrd-tool.json`` contents of the package as a raw dict,
        i.e. neither validated nor expanded.

        Consumed by :py:data:`metadata`.

        (Override when ``ocrd-tool.json`` does not come from a file.)
        """
        json_path = self.metadata_location
        return parse_json_file_with_comments(json_path)
157
158
    @cached_property
    def metadata(self) -> dict:
        """
        The ``ocrd-tool.json`` dict contents of the package, following the OCR-D
        `spec <https://ocr-d.de/en/spec/ocrd_tool>`_ for processor tools.

        The raw dict is passed through the validator for the
        `schema <https://ocr-d.de/en/spec/ocrd_tool#definition>`_; an
        unsuccessful report is logged as an error, but the dict is
        returned regardless.

        Consumed by :py:data:`ocrd_tool` and :py:data:`version`.

        (Override to provide metadata programmatically instead of from a
        JSON file.)
        """
        metadata = self.metadata_rawdict
        report = OcrdToolValidator.validate(metadata)
        if report.is_valid:
            return metadata
        # a report without errors is merely "problematic", not "invalid"
        verdict = 'invalid' if report.errors else 'problematic'
        self.logger.error(f"The ocrd-tool.json of this processor is {verdict}:\n"
                          f"{report.to_xml()}.\nPlease open an issue at {metadata.get('git_url', 'the website')}.")
        return metadata
179
180
    @cached_property
181
    def version(self) -> str:
182
        """
183
        The program version of the package.
184
        Usually the ``version`` part of :py:data:`metadata`.
185
186
        (Override if you do not want to use :py:data:`metadata` lookup
187
        mechanism.)
188
        """
189
        return self.metadata['version']
190
191
    @cached_property
192
    def executable(self) -> str:
193
        """
194
        The executable name of this processor tool. Taken from the runtime
195
        filename.
196
197
        Used by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`.
198
199
        (Override if your entry-point name deviates from the ``executable``
200
        name, or the processor gets instantiated from another runtime.)
201
        """
202
        return os.path.basename(inspect.stack()[-1].filename)
203
204
    @cached_property
205
    def ocrd_tool(self) -> dict:
206
        """
207
        The ``ocrd-tool.json`` dict contents of this processor tool.
208
        Usually the :py:data:`executable` key of the ``tools`` part
209
        of :py:data:`metadata`.
210
211
        (Override if you do not want to use :py:data:`metadata` lookup
212
        mechanism.)
213
        """
214
        return self.metadata['tools'][self.executable]
215
216
    @property
    def parameter(self) -> Optional[dict]:
        """the runtime parameter dict to be used by this processor (``None`` until set)"""
        return getattr(self, '_parameter', None)

    @parameter.setter
    def parameter(self, parameter : dict) -> None:
        # a previous parameter set means setup() already ran: tear down first
        if self.parameter is not None:
            self.shutdown()
        report = ParameterValidator(self.ocrd_tool).validate(parameter)
        if not report.is_valid:
            raise ValueError(f'Invalid parameters:\n{report.to_xml()}')
        # freeze the dict so it cannot be modified afterwards
        self._parameter = frozendict(parameter)
        # (re-)run setup to load models etc
        self.setup()
235
236
    def __init__(
            self,
            # FIXME: remove in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=config.OCRD_DOWNLOAD_INPUT,
            version=None
    ):
        """
        Instantiate, but do not setup (neither for processing nor other usage).
        If given, do parse and validate :py:data:`.parameter`.

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process.
                 If not ``None``, then `chdir` to that directory.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
        Keyword Args:
             ocrd_tool (dict): tool description to use instead of the :py:data:`metadata` lookup.
                 Deprecated: use or override the metadata/executable/ocrd_tool properties instead.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``.
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
             output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process
                 (or empty for all pages).
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
             download_files (boolean): Whether input files will be downloaded prior to processing,
                 defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT`
             version (string): program version to use instead of the :py:data:`metadata` lookup.
                 Deprecated: use or override the metadata/version properties instead.
        """
        if ocrd_tool is not None:
            deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                "use or override metadata/executable/ocrd-tool properties instead")
            # kept for backwards compatibility with code reading the private names
            self._ocrd_tool = ocrd_tool
            self._executable = ocrd_tool['executable']
            # seed the cached properties, so the passed value actually takes
            # effect (otherwise the metadata lookup would silently win)
            self.ocrd_tool = ocrd_tool
            self.executable = ocrd_tool['executable']
        if version is not None:
            deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
                                "use or override metadata/version properties instead")
            self._version = version
            # seed the cached property (see above)
            self.version = version
        if workspace is not None:
            deprecation_warning("Passing a workspace argument other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.workspace = workspace
            # remember where we came from: resolve_resource() searches there
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        if input_file_grp is not None:
            deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.input_file_grp = input_file_grp
        if output_file_grp is not None:
            deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.output_file_grp = output_file_grp
        if page_id is not None:
            deprecation_warning("Passing a page_id kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            # normalise the empty string to None
            self.page_id = page_id or None
        self.download = download_files
        #: The logger to be used by processor implementations.
        # `ocrd.processor.base` internals should use :py:attr:`self._base_logger`
        self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
        self._base_logger = getLogger('ocrd.processor.base')
        if parameter is not None:
            # goes through the property setter: validates, freezes, runs setup()
            self.parameter = parameter
        # ensure that shutdown gets called at destruction
        # NOTE(review): the callback is a bound method of ``self``; the weakref
        # documentation warns that such a finalizer keeps the object alive, so
        # shutdown() will presumably only run at interpreter exit - confirm
        self._finalizer = weakref.finalize(self, self.shutdown)
        # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
        setattr(self, 'process',
                deprecated(version='3.0',
                           reason='process() should be replaced with process_page_pcgts() '
                                  'or process_page_file() or process_workspace()')(getattr(self, 'process')))
312
313
    def show_help(self, subcommand=None):
        """
        Print a usage description covering the standard CLI as well as all
        ocrd-tool parameters and docstrings of this processor.
        """
        usage = generate_processor_help(self.ocrd_tool,
                                        processor_instance=self,
                                        subcommand=subcommand)
        print(usage)
319
320
    def show_version(self):
        """
        Print the version of this processor along with the OCR-D core version.
        """
        print(f"Version {self.version!s}, ocrd/core {OCRD_VERSION!s}")
325
326
    def verify(self):
327
        """
328
        Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.
329
        """
330
        assert self.input_file_grp is not None
331
        assert self.output_file_grp is not None
332
        input_file_grps = self.input_file_grp.split(',')
333
        output_file_grps = self.output_file_grp.split(',')
334
        def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
335
            if isinstance(spec, int):
336
                if spec > 0:
337
                    assert len(grps) == spec, msg % (len(grps), str(spec))
338
            else:
339
                assert isinstance(spec, list)
340
                minimum = spec[0]
341
                maximum = spec[1]
342
                if minimum > 0:
343
                    assert len(grps) >= minimum, msg % (len(grps), str(spec))
344
                if maximum > 0:
345
                    assert len(grps) <= maximum, msg % (len(grps), str(spec))
346
        assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
347
                                    "Unexpected number of input file groups %d vs %s")
348
        assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
349
                                    "Unexpected number of output file groups %d vs %s")
350
        for input_file_grp in input_file_grps:
351
            assert input_file_grp in self.workspace.mets.file_groups
352
        # keep this for backwards compatibility:
353
        return True
354
355
    def dump_json(self):
356
        """
357
        Print :py:attr:`ocrd_tool` on stdout.
358
        """
359
        print(json.dumps(self.ocrd_tool, indent=True))
360
361
    def dump_module_dir(self):
362
        """
363
        Print :py:attr:`moduledir` on stdout.
364
        """
365
        print(self.moduledir)
366
367
    def list_resources(self):
368
        """
369
        Find all installed resource files in the search paths and print their path names.
370
        """
371
        for res in self.list_all_resources():
372
            print(res)
373
374
    def setup(self) -> None:
375
        """
376
        Prepare the processor for actual data processing,
377
        prior to changing to the workspace directory but
378
        after parsing parameters.
379
380
        (Override this to load models into memory etc.)
381
        """
382
        pass
383
384
    def shutdown(self) -> None:
385
        """
386
        Bring down the processor after data processing,
387
        after to changing back from the workspace directory but
388
        before exiting (or setting up with different parameters).
389
390
        (Override this to unload models from memory etc.)
391
        """
392
        pass
393
394
    @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
    def process(self) -> None:
        """
        Deprecated entry point: process all files of the :py:data:`workspace`
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        (The base implementation only raises; subclasses still relying on
        this API have to override it.)
        """
        raise NotImplementedError
407
408
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        (This will iterate over pages and files, calling
        :py:meth:`.process_page_file` and handling exceptions.
        It should be overridden by subclasses to handle cases
        like post-processing or computation across pages.)

        Failures on a page are handled according to
        ``config.OCRD_EXISTING_OUTPUT`` (for :py:class:`FileExistsError`)
        and ``config.OCRD_MISSING_OUTPUT`` (for everything else). If
        :py:meth:`.process_page_file` raises :py:class:`NotImplementedError`,
        the deprecated :py:meth:`.process` is called instead.
        """
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            self.verify()
            try:
                # FIXME: add page parallelization by running multiprocessing.Pool (#322)
                for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
                    input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
                    # the page ID is taken from the first fileGrp that has a file for this page
                    page_id = next(input_file.pageId
                                   for input_file in input_file_tuple
                                   if input_file)
                    self._base_logger.info(f"processing page {page_id}")
                    for i, input_file in enumerate(input_file_tuple):
                        if input_file is None:
                            # file/page not found in this file grp
                            continue
                        input_files[i] = input_file
                        if not self.download:
                            continue
                        try:
                            input_files[i] = self.workspace.download_file(input_file)
                        except (ValueError, FileNotFoundError, HTTPError) as e:
                            # best effort: keep the non-downloaded file and carry on
                            self._base_logger.error(repr(e))
                            self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
                    # FIXME: differentiate error cases in various ways:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    try:
                        self.process_page_file(*input_files)
                    except Exception as err:
                        # we have to be broad here, but want to exclude NotImplementedError
                        # (it triggers the fallback to process() in the outer handler)
                        if isinstance(err, NotImplementedError):
                            raise
                        if isinstance(err, FileExistsError):
                            if config.OCRD_EXISTING_OUTPUT == 'ABORT':
                                raise
                            if config.OCRD_EXISTING_OUTPUT == 'SKIP':
                                continue
                            if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
                                # too late here, must not happen
                                raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") from err
                        # FIXME: re-usable/actionable logging
                        self._base_logger.exception(f"Failure on page {page_id}: {err}")
                        if config.OCRD_MISSING_OUTPUT == 'ABORT':
                            raise
                        if config.OCRD_MISSING_OUTPUT == 'SKIP':
                            continue
                        if config.OCRD_MISSING_OUTPUT == 'COPY':
                            # substitute the (first) input as if it was the result
                            self._copy_page_file(input_files[0])
                        else:
                            desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
                            raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
            except NotImplementedError:
                # fall back to deprecated method
                self.process()
476
477
    def _copy_page_file(self, input_file : OcrdFileType) -> None:
        """
        Take the given ``input_file`` of the :py:data:`workspace`
        (the file of one physical page in one input fileGrp), parse it
        as PAGE, and add it to the output fileGrp as if it was a
        processing result.

        Input that cannot be parsed as (or converted to) PAGE is logged
        as an error and skipped.
        """
        assert isinstance(input_file, get_args(OcrdFileType))
        self._base_logger.debug(f"parsing file {input_file.ID} for page {input_file.pageId}")
        try:
            pcgts_copy : OcrdPage = page_from_file(input_file)
        except ValueError as err:
            # not PAGE and not an image to generate PAGE for
            self._base_logger.error(f"non-PAGE input for page {input_file.pageId}: {err}")
            return
        new_file_id = make_file_id(input_file, self.output_file_grp)
        pcgts_copy.set_pcGtsId(new_file_id)
        self.add_metadata(pcgts_copy)
        overwrite = config.OCRD_EXISTING_OUTPUT == 'OVERWRITE'
        self.workspace.add_file(
            file_id=new_file_id,
            file_grp=self.output_file_grp,
            page_id=input_file.pageId,
            local_filename=os.path.join(self.output_file_grp, new_file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts_copy),
            force=overwrite,
        )
504
505
    def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
        """
        Process the given ``input_files`` of the :py:data:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`.OcrdFile` per input fileGrp)
        under the given :py:data:`.parameter`, and make sure the
        results get added accordingly.

        The first entry must be a file (the output file ID is derived from
        it). Any further entry may be ``None`` if the page has no file in
        that fileGrp; the corresponding PAGE argument passed on to
        :py:meth:`.process_page_pcgts` is then ``None`` as well (the same
        as for input that cannot be parsed as PAGE).

        (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses
        to handle cases like multiple output fileGrps, non-PAGE input etc.)

        Raises:
            ValueError: if a result image references neither a ``PageType``
                nor an ``AlternativeImageType``
        """
        input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
        assert isinstance(input_files[0], get_args(OcrdFileType))
        page_id = input_files[0].pageId
        for i, input_file in enumerate(input_files):
            if input_file is None:
                # page is missing from this input fileGrp - leave its slot empty
                # instead of failing the whole page
                self._base_logger.debug(f"ignoring missing input file #{i} for page {page_id}")
                continue
            assert isinstance(input_file, get_args(OcrdFileType))
            self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
            try:
                page_ = page_from_file(input_file)
                assert isinstance(page_, OcrdPage)
                input_pcgts[i] = page_
            except ValueError as err:
                # not PAGE and not an image to generate PAGE for
                self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
        output_file_id = make_file_id(input_files[0], self.output_file_grp)
        result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
        for image_result in result.images:
            image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
            image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
            # point the PAGE reference at the file about to be written
            if isinstance(image_result.alternative_image, PageType):
                image_result.alternative_image.set_imageFilename(image_file_path)
                image_result.alternative_image.set_imageWidth(image_result.pil.width)
                image_result.alternative_image.set_imageHeight(image_result.pil.height)
            elif isinstance(image_result.alternative_image, AlternativeImageType):
                image_result.alternative_image.set_filename(image_file_path)
            else:
                raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
                                 f"{type(image_result.alternative_image)}")
            self.workspace.save_image_file(
                image_result.pil,
                image_file_id,
                self.output_file_grp,
                page_id=page_id,
                file_path=image_file_path,
                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
            )
        result.pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(result.pcgts)
        self.workspace.add_file(file_id=output_file_id,
                                file_grp=self.output_file_grp,
                                page_id=page_id,
                                local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
                                mimetype=MIMETYPE_PAGE,
                                content=to_xml(result.pcgts),
                                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
        )
561
562
    def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
        """
        Process the given ``input_pcgts`` of the :py:data:`.workspace`
        (one parsed :py:class:`.OcrdPage` per input fileGrp, all for the
        same physical page) under the given :py:data:`.parameter`, and
        return the resulting :py:class:`.OcrdPageResult`.

        Implementations may add :py:class:`.OcrdPageResultImage` instances
        to the ``images`` attribute of the result. Each needs
        ``pil`` (:py:class:`PIL.Image` image data),
        ``file_id_suffix`` (used for generating the ID of the saved image) and
        ``alternative_image`` (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType`
        whose filename gets set to the saved image).

        (This contains the main functionality and must be overridden by subclasses,
        unless it never gets called because :py:meth:`.process_page_file` is overridden.)
        """
        raise NotImplementedError
581
582
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Append a PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem``
        to the metadata of :py:class:`.OcrdPage` ``pcgts``, describing the processing step
        together with the runtime parameters and the processor / core versions.
        """
        metadata_obj = pcgts.get_Metadata()
        assert metadata_obj is not None
        step_name = self.ocrd_tool['steps'][0]
        tool_name = self.ocrd_tool['executable']
        # one label per runtime parameter
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=value)
                   for key, value in self.parameter.items()])
        # versions of this processor and of ocrd/core
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=self.ocrd_tool['executable'], value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        item = MetadataItemType(type_="processingStep",
                                name=step_name,
                                value=tool_name,
                                Labels=[parameter_labels, version_labels])
        metadata_obj.add_MetadataItem(item)
607
608
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path, following the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_

        A value that already exists as a path is returned unchanged;
        otherwise the first existing resource candidate is returned.

        Args:
            val (string): resource value to resolve
        Raises:
            ResourceNotFoundError: if no candidate location exists
        """
        executable = self.ocrd_tool['executable']
        if exists(val):
            self._base_logger.debug("Resolved to absolute path %s" % val)
            return val
        # FIXME: remove once workspace arg / old_pwd is gone:
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=cwd, moduled=self.moduledir)
        existing = list(filter(exists, candidates))
        if not existing:
            raise ResourceNotFoundError(val, executable)
        self._base_logger.debug("Resolved %s to absolute path %s" % (val, existing[0]))
        return existing[0]
632
633
    def show_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_,
        then write its contents to stdout: the raw bytes for a file,
        a gzipped tarball for a directory.

        Args:
            val (string): resource value to show
        """
        resource_path = Path(self.resolve_resource(val))
        if not resource_path.is_dir():
            sys.stdout.buffer.write(resource_path.read_bytes())
            return
        # pack from inside the directory, so the archive members are relative
        with pushd_popd(resource_path):
            archive = io.BytesIO()
            with tarfile.open(fileobj=archive, mode='w:gz') as tarball:
                tarball.add('.')
            archive.seek(0)
            copyfileobj(archive, sys.stdout.buffer)
653
654
    def list_all_resources(self):
        """
        Yield all resources found in the filesystem for this processor,
        filtered by content-type (matched against the filename suffix)
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        accept_anything = '*/*' in mimetypes
        accept_directories = 'text/directory' in mimetypes
        for found in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            res = Path(found)
            if accept_anything:
                yield res
                continue
            if res.is_dir() and not accept_directories:
                continue
            if res.is_file():
                # a MIME type without known extension matches any suffix,
                # otherwise the suffix must equal the extension
                suffix_matches = any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
                                     for mime in mimetypes)
                if not suffix_matches:
                    continue
            yield res
669
670
    @property
    def module(self):
        """
        The top-level module this processor belongs to.

        This is the shortest dotted prefix of the defining module's name
        whose entry in :py:data:`sys.modules` has a non-empty ``__file__``
        (i.e. is not just a namespace package). If no prefix qualifies,
        the full module name is returned.
        """
        components = self.__module__.split('.')
        # try successively longer prefixes: 'a', 'a.b', 'a.b.c', ...
        for depth in range(1, len(components) + 1):
            prefix = '.'.join(components[:depth])
            if getattr(sys.modules[prefix], '__file__', None):
                return prefix
        # fall-back
        return self.__module__
685
686
    @property
    def moduledir(self):
        """
        The filesystem path of the module directory, i.e. the location
        of ``'.'`` inside the top-level module (see :py:attr:`module`).
        """
        top_level = self.module
        return resource_filename(top_level, '.')
692
693
    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        For each physical page:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)

        See `algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

        The selection itself is delegated to :py:meth:`zip_input_files`
        (without MIME type filter and with ``on_error='abort'``); this
        property merely unwraps the resulting 1-tuples.

        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.

        Raises:
            ValueError: if no input fileGrp is set.
            AssertionError: if the tuples have more than one member,
                i.e. there are multiple input fileGrps.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        file_tuples = self.zip_input_files(mimetype=None, on_error='abort')
        if file_tuples:
            # all tuples have equal length, so checking the first suffices
            assert len(file_tuples[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
            return [file_tuple[0] for file_tuple in file_tuples]
        return []
719
720
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all), regardless
          of how many further matches follow
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Without a MIME type filter, multiple matches for PAGE-XML will
        always raise an exception (independent of :py:attr:`on_error`).

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             on_error (string): How to handle multiple file matches per page.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        Raises:
            ValueError: if no input fileGrp is set, or if a conflict occurs
                and :py:attr:`on_error` is not one of the known strategies.
            NonUniqueInputFile: on multiple matches with ``on_error='abort'``,
                or on multiple PAGE-XML matches without MIME type filter.
            MissingInputFile: if a page has no file in some fileGrp and
                ``config.OCRD_MISSING_INPUT`` is ``abort``.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        # page ID -> list with one slot (file or None) per input fileGrp
        pages = {}
        # (page ID, fileGrp index) slots emptied by on_error='skip'; remembered so
        # that a third (or later) match cannot silently re-populate the slot
        skipped = set()
        for i, ifg in enumerate(ifgs):
            files_ = sorted(
                self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                # sort by MIME type so PAGE comes before images
                key=lambda file_: file_.mimetype)
            for file_ in files_:
                if not file_.pageId:
                    # ignore document-global files
                    continue
                ift = pages.setdefault(file_.pageId, [None] * len(ifgs))
                if (file_.pageId, i) in skipped:
                    self._base_logger.debug(f"ignoring file {file_.ID} for page {file_.pageId} "
                                            f"already skipped in input file group {ifg}")
                    continue
                if not ift[i]:
                    self._base_logger.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}")
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                self._base_logger.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}")
                if mimetype:
                    # filter was active, this must not happen
                    reason = f"of same MIME type {mimetype}"
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise NonUniqueInputFile(ifg, file_.pageId, None)
                    # keep PAGE match
                    continue
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    reason = "but no PAGE available"
                self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                          f"conflicts with file {ift[i].ID} {reason} - on_error={on_error}")
                if on_error == 'skip':
                    ift[i] = None
                    skipped.add((file_.pageId, i))
                elif on_error == 'first':
                    pass # keep first match
                elif on_error == 'last':
                    ift[i] = file_
                elif on_error == 'abort':
                    raise NonUniqueInputFile(ifg, file_.pageId, mimetype or None)
                else:
                    raise ValueError("Unknown 'on_error' strategy '%s'" % on_error)
        # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range)
        if self.page_id and not pages:
            self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
                                       f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
        ifts = []
        for page, ifiles in pages.items():
            for ifg, ifile in zip(ifgs, ifiles):
                if not ifile:
                    # could be from non-unique with on_error=skip or from true gap
                    self._base_logger.error(f'Found no file for page {page} in file group {ifg}')
                    if config.OCRD_MISSING_INPUT == 'abort':
                        raise MissingInputFile(ifg, page, mimetype)
            if not any(ifiles):
                # must be from non-unique with on_error=skip
                self._base_logger.warning(f'Found no files for {page} - skipping')
                continue
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
841