Passed
Pull Request — master (#1240)
by
unknown
05:39
created

ocrd.processor.base.Processor.metadata()   A

Complexity

Conditions 2

Size

Total Lines 21
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 7
dl 0
loc 21
rs 10
c 0
b 0
f 0
cc 2
nop 1
1
"""
2
Processor base class and helper functions.
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from functools import cached_property
13
from os.path import exists, join
14
from shutil import copyfileobj
15
import json
16
import os
17
from os import getcwd
18
from pathlib import Path
19
from typing import Any, List, Optional, Union, get_args
20
import sys
21
import inspect
22
import tarfile
23
import io
24
import weakref
25
from frozendict import frozendict
26
from concurrent.futures import ThreadPoolExecutor, TimeoutError
27
28
from click import wrap_text
29
from deprecated import deprecated
30
from requests import HTTPError
31
32
from ..workspace import Workspace
33
from ..mets_server import ClientSideOcrdMets
34
from ocrd_models.ocrd_file import OcrdFileType
35
from .ocrd_page_result import OcrdPageResult
36
from ocrd_utils import (
37
    VERSION as OCRD_VERSION,
38
    MIMETYPE_PAGE,
39
    MIME_TO_EXT,
40
    config,
41
    getLogger,
42
    list_resource_candidates,
43
    pushd_popd,
44
    list_all_resources,
45
    get_processor_resource_types,
46
    resource_filename,
47
    parse_json_file_with_comments,
48
    make_file_id,
49
    deprecation_warning
50
)
51
from ocrd_validators import ParameterValidator
52
from ocrd_models.ocrd_page import (
53
    PageType,
54
    AlternativeImageType,
55
    MetadataItemType,
56
    LabelType,
57
    LabelsType,
58
    OcrdPage,
59
    to_xml,
60
)
61
from ocrd_modelfactory import page_from_file
62
from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
63
64
# XXX imports must remain for backwards-compatibility
65
from .helpers import run_cli, run_processor # pylint: disable=unused-import
66
67
68
class ResourceNotFoundError(FileNotFoundError):
    """
    An exception signifying the requested processor resource
    cannot be resolved.

    Attributes:
        name: name of the resource that could not be found
        executable: name of the processor executable the resource belongs to
        message: human-readable description, including a download hint
    """
    def __init__(self, name, executable):
        self.name = name
        self.executable = executable
        self.message = (f"Could not find resource '{name}' for executable '{executable}'. "
                        f"Try 'ocrd resmgr download {executable} {name}' to download this resource.")
        super().__init__(self.message)

    def __reduce__(self):
        # The default exception reduction re-creates the instance from ``self.args``,
        # i.e. from the single formatted message, which does not match the two-argument
        # constructor. Rebuild from the original constructor arguments instead, so the
        # exception survives copy.copy() and pickling.
        return (self.__class__, (self.name, self.executable))
79
80
class NonUniqueInputFile(ValueError):
    """
    An exception signifying the specified fileGrp / pageId / mimetype
    selector yields multiple PAGE files, or no PAGE files but multiple images,
    or multiple files of that mimetype.

    Attributes:
        fileGrp: the file group that was queried
        pageId: the physical page that was queried
        mimetype: the MIME type that was queried (falsy for the PAGE+image default)
        message: human-readable description of the failed selection
    """
    def __init__(self, fileGrp, pageId, mimetype):
        self.fileGrp = fileGrp
        self.pageId = pageId
        self.mimetype = mimetype
        self.message = (f"Could not determine unique input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
        super().__init__(self.message)

    def __reduce__(self):
        # The default exception reduction re-creates the instance from ``self.args``,
        # i.e. from the single formatted message, which does not match the three-argument
        # constructor. Rebuild from the original constructor arguments instead, so the
        # exception survives copy.copy() and pickling.
        return (self.__class__, (self.fileGrp, self.pageId, self.mimetype))
93
94
class MissingInputFile(ValueError):
    """
    An exception signifying the specified fileGrp / pageId / mimetype
    selector yields no PAGE files, or no PAGE and no image files,
    or no files of that mimetype.

    Attributes:
        fileGrp: the file group that was queried
        pageId: the physical page that was queried
        mimetype: the MIME type that was queried (falsy for the PAGE+image default)
        message: human-readable description of the failed selection
    """
    def __init__(self, fileGrp, pageId, mimetype):
        self.fileGrp = fileGrp
        self.pageId = pageId
        self.mimetype = mimetype
        self.message = (f"Could not find input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
        super().__init__(self.message)

    def __reduce__(self):
        # The default exception reduction re-creates the instance from ``self.args``,
        # i.e. from the single formatted message, which does not match the three-argument
        # constructor. Rebuild from the original constructor arguments instead, so the
        # exception survives copy.copy() and pickling.
        return (self.__class__, (self.fileGrp, self.pageId, self.mimetype))
107
108
class Processor():
109
    """
110
    A processor is a tool that implements the uniform OCR-D
111
    `command-line interface for run-time data processing <https://ocr-d.de/en/spec/cli>`_.
112
113
    That is, it executes a single workflow step, or a combination of workflow steps,
114
    on the workspace (represented by local METS). It reads input files for all or selected
115
    physical pages of the input fileGrp(s), computes additional annotation, and writes output
116
    files for them into the output fileGrp(s). It may take a number of optional or mandatory
117
    parameters.
118
    """
119
120
    max_instances : int = -1
121
    """
122
    maximum number of cached instances (ignored if negative), to be applied on top of
123
    :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
124
125
    (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.)
126
    """
127
128
    max_workers : int = -1
129
    """
130
    maximum number of processor threads for page-parallel processing (ignored if negative),
131
    to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
132
    whatever is smaller).
133
134
    (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
135
    - at once, or if your class is not thread-safe.)
136
    """
137
138
    max_page_seconds : int = -1
139
    """
140
    maximum number of seconds may be spent processing a single page (ignored if negative),
141
    to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT`
142
    (i.e. whatever is smaller).
143
144
    (Override this if you know how costly this processor may be, irrespective of image size
145
    or complexity of the page.)
146
    """
147
148
    @property
    def metadata_filename(self) -> str:
        """
        Name of the ``ocrd-tool.json`` file, relative to the package root.

        Consumed by :py:data:`metadata_location`.

        (Override if the file does not sit in the root of the module,
        e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``.)
        """
        relative_name = 'ocrd-tool.json'
        return relative_name
159
160
    @cached_property
    def metadata_location(self) -> Path:
        """
        Absolute path of the ``ocrd-tool.json`` file as distributed with the package.

        Walks the dotted package name of the module defining this processor
        from the outermost component inwards, and resolves
        :py:data:`metadata_filename` relative to the first (shortest) package
        prefix whose import spec has a location.

        Used by :py:data:`metadata_rawdict`.

        (Override if ``ocrd-tool.json`` is not distributed with the Python package.)

        Raises:
            Exception: if none of the package prefixes has a location
        """
        module = inspect.getmodule(self)
        package_parts = module.__package__.split('.')
        # for namespace packages, we cannot just use the first token
        for depth in range(1, len(package_parts) + 1):
            prefix = '.'.join(package_parts[:depth])
            if sys.modules[prefix].__spec__.has_location:
                return resource_filename(prefix, self.metadata_filename)
        # format the message here: Exception() does not interpolate %-style arguments
        raise Exception(f"cannot find top-level module prefix for {module.__package__}")
177
178
    @cached_property
    def metadata_rawdict(self) -> dict:
        """
        The ``ocrd-tool.json`` contents of the package as parsed from
        :py:data:`metadata_location`, without validation or expansion.

        Used by :py:data:`metadata`.

        (Override if ``ocrd-tool.json`` is not in a file.)
        """
        location = self.metadata_location
        return parse_json_file_with_comments(location)
188
189
    @cached_property
    def metadata(self) -> dict:
        """
        The ``ocrd-tool.json`` dict contents of the package, following the OCR-D
        `spec <https://ocr-d.de/en/spec/ocrd_tool>`_ for processor tools.

        The dict from :py:data:`metadata_rawdict` is passed through
        :py:class:`OcrdToolValidator`; a report that is not valid gets logged
        as an error, but the dict is returned in either case.

        Used by :py:data:`ocrd_tool` and :py:data:`version`.

        (Override if you want to provide metadata programmatically instead of a
        JSON file.)
        """
        rawdict = self.metadata_rawdict
        report = OcrdToolValidator.validate(rawdict)
        if report.is_valid:
            return rawdict
        # a report without errors is merely "problematic"
        severity = 'invalid' if report.errors else 'problematic'
        self.logger.error(f"The ocrd-tool.json of this processor is {severity}:\n"
                          f"{report.to_xml()}.\nPlease open an issue at {rawdict.get('git_url', 'the website')}.")
        return rawdict
210
211
    @cached_property
    def version(self) -> str:
        """
        The program version of the package, looked up as the
        ``version`` entry of :py:data:`metadata`.

        (Override if you do not want to use :py:data:`metadata` lookup
        mechanism.)
        """
        tool_metadata = self.metadata
        return tool_metadata['version']
221
222
    @cached_property
    def executable(self) -> str:
        """
        The executable name of this processor tool: the base name of the
        source file belonging to the outermost frame of the current call stack.

        Used by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`.

        (Override if your entry-point name deviates from the ``executable``
        name, or the processor gets instantiated from another runtime.)
        """
        outermost_frame = inspect.stack()[-1]
        return os.path.basename(outermost_frame.filename)
234
235
    @cached_property
    def ocrd_tool(self) -> dict:
        """
        The ``ocrd-tool.json`` dict contents of this processor tool, i.e.
        the entry for :py:data:`executable` under the ``tools`` part of
        :py:data:`metadata`.

        (Override if you do not want to use :py:data:`metadata` lookup
        mechanism.)
        """
        tools = self.metadata['tools']
        return tools[self.executable]
246
247
    @property
    def parameter(self) -> Optional[dict]:
        """the runtime parameter dict to be used by this processor (``None`` while unset)"""
        return getattr(self, '_parameter', None)
253
254
    @parameter.setter
    def parameter(self, parameter : dict) -> None:
        """
        Validate and store the runtime parameters, then (re-)run :py:meth:`setup`.

        If parameters had been set before, :py:meth:`shutdown` is called first.

        Raises:
            ValueError: if ``parameter`` does not validate against :py:data:`ocrd_tool`
        """
        if self.parameter is not None:
            # tear down whatever the previous parameter set made setup() prepare
            self.shutdown()
        report = ParameterValidator(self.ocrd_tool).validate(parameter)
        if not report.is_valid:
            raise ValueError(f'Invalid parameters:\n{report.to_xml()}')
        # freeze the dict so it cannot be modified afterwards
        self._parameter = frozendict(parameter)
        # load models etc. for the new parameters
        self.setup()
266
267
    def __init__(
            self,
            # FIXME: remove in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=config.OCRD_DOWNLOAD_INPUT,
            version=None
    ):
        """
        Create the processor instance. Nothing gets set up here unless
        ``parameter`` is passed, in which case it is validated and stored via
        the :py:data:`.parameter` property (which in turn runs :py:meth:`setup`).

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process.
                 If not ``None``, the current directory is remembered and then
                 changed to the workspace directory.
                 Deprecated since version 3.0: Should be ``None`` here, but then
                 needs to be set before processing.
        Keyword Args:
             ocrd_tool (dict): Deprecated - replaces the :py:data:`ocrd_tool`
                 lookup; its ``executable`` entry replaces :py:data:`executable`.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``.
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input.
                 Deprecated since version 3.0: Should be ``None`` here, but then
                 needs to be set before processing.
             output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output.
                 Deprecated since version 3.0: Should be ``None`` here, but then
                 needs to be set before processing.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process
                 (or empty for all pages).
                 Deprecated since version 3.0: Should be ``None`` here, but then
                 needs to be set before processing.
             download_files (boolean): Whether input files will be downloaded prior to
                 processing, defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT`.
             version (string): Deprecated - replaces the :py:data:`version` lookup.
        """
        # --- deprecated ways of overriding the metadata lookup ---
        if ocrd_tool is not None:
            deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                "use or override metadata/executable/ocrd-tool properties instead")
            self.ocrd_tool = ocrd_tool
            self.executable = ocrd_tool['executable']
        if version is not None:
            deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
                                "use or override metadata/version properties instead")
            self.version = version

        # --- deprecated ways of passing the processing context ---
        if workspace is not None:
            deprecation_warning("Passing a workspace argument other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.workspace = workspace
            # remember where we came from before entering the workspace
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        if input_file_grp is not None:
            deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.input_file_grp = input_file_grp
        if output_file_grp is not None:
            deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.output_file_grp = output_file_grp
        if page_id is not None:
            deprecation_warning("Passing a page_id kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            # an empty selection is normalised to None
            self.page_id = page_id or None

        self.download = download_files

        #: The logger to be used by processor implementations.
        # `ocrd.processor.base` internals should use :py:attr:`self._base_logger`
        self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
        self._base_logger = getLogger('ocrd.processor.base')

        if parameter is not None:
            # goes through the property setter (validation + setup)
            self.parameter = parameter

        # ensure that shutdown gets called at destruction
        # NOTE(review): the callback is a bound method of ``self``; the weakref.finalize
        # documentation advises against that, since the callback then references the
        # object it is supposed to watch - confirm this is intended.
        self._finalizer = weakref.finalize(self, self.shutdown)

        # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
        mark_deprecated = deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
        self.process = mark_deprecated(self.process)
343
344
    def show_help(self, subcommand=None):
        """
        Print the usage description generated for :py:data:`ocrd_tool`
        (standard CLI plus this processor's parameters and docstrings),
        optionally for the given ``subcommand``.
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self, subcommand=subcommand)
        print(help_text)
350
351
    def show_version(self):
        """
        Print this processor's version along with the OCR-D core version.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
356
357
    def verify(self):
        """
        Check that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the
        processor's requirements: both must be set, their number of groups must match the
        cardinalities declared in :py:data:`ocrd_tool` (if any), every input group must
        exist in the workspace METS, and every output group must either not exist yet,
        or have no files for :py:attr:`page_id`, or be covered by
        ``OCRD_EXISTING_OUTPUT`` being ``OVERWRITE`` or ``SKIP``.

        Returns:
            ``True`` (kept for backwards compatibility)

        Raises:
            AssertionError: if any of the above requirements is violated
        """
        # NOTE: every check in here is an ``assert`` statement, so none of them
        # is performed when Python runs with optimisation (-O).
        assert self.input_file_grp is not None
        assert self.output_file_grp is not None
        input_grps = self.input_file_grp.split(',')
        output_grps = self.output_file_grp.split(',')

        def check_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
            # spec is either an exact count, or a [minimum, maximum] pair;
            # non-positive numbers mean "unconstrained"
            if isinstance(spec, int):
                if spec > 0:
                    assert len(grps) == spec, msg % (len(grps), str(spec))
                return
            assert isinstance(spec, list)
            lower, upper = spec[0], spec[1]
            if lower > 0:
                assert len(grps) >= lower, msg % (len(grps), str(spec))
            if upper > 0:
                assert len(grps) <= upper, msg % (len(grps), str(spec))

        # verify input and output file groups in parameters
        # FIXME: enforce unconditionally as soon as grace period for deprecation is over
        for grps, key, msg in (
                (input_grps, 'input_file_grp_cardinality',
                 "Unexpected number of input file groups %d vs %s"),
                (output_grps, 'output_file_grp_cardinality',
                 "Unexpected number of output file groups %d vs %s"),
        ):
            if key in self.ocrd_tool:
                check_cardinality(grps, self.ocrd_tool[key], msg)

        # verify input and output file groups in METS
        for grp in input_grps:
            assert grp in self.workspace.mets.file_groups, \
                f"input fileGrp {grp} does not exist in workspace {self.workspace}"
        for grp in output_grps:
            assert (
                grp not in self.workspace.mets.file_groups
                or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP']
                or not any(self.workspace.mets.find_files(
                    pageId=self.page_id, fileGrp=grp))
            ), f"output fileGrp {grp} already exists in workspace {self.workspace}"
        # keep this for backwards compatibility:
        return True
397
398
    def dump_json(self):
        """
        Serialise :py:attr:`ocrd_tool` as indented JSON and print it on stdout.
        """
        serialized = json.dumps(self.ocrd_tool, indent=True)
        print(serialized)
403
404
    def dump_module_dir(self):
        """
        Write the value of :py:attr:`moduledir` to stdout.
        """
        module_dir = self.moduledir
        print(module_dir)
409
410
    def list_resources(self):
        """
        Print every entry yielded by :py:meth:`list_all_resources`
        (the installed resource files in the search paths), one per line.
        """
        for resource_path in self.list_all_resources():
            print(resource_path)
416
417
    def setup(self) -> None:
        """
        Hook for preparing the processor for actual data processing.
        It runs after the parameters have been parsed, and is meant to
        run before changing to the workspace directory.

        Does nothing by default.

        (Override this to load models into memory etc.)
        """
426
427
    def shutdown(self) -> None:
        """
        Hook for bringing down the processor after data processing,
        i.e. after changing back from the workspace directory, but
        before exiting (or before setting up with different parameters).

        Does nothing by default.

        (Override this to unload models from memory etc.)
        """
436
437
    @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
    def process(self) -> None:
        """
        Deprecated entry point: process all files of the :py:data:`workspace`
        from :py:data:`input_file_grp` to :py:data:`output_file_grp`
        for :py:data:`page_id` (or all pages) under :py:data:`parameter`.

        The base implementation always raises :py:class:`NotImplementedError`;
        subclasses still relying on this API need to override it.
        """
        raise NotImplementedError
450
451
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        Inside the workspace directory, this runs :py:meth:`verify`, then iterates
        over the input file tuples of each page, optionally downloads them, and
        submits one :py:meth:`.process_page_file` task per page to a thread pool.
        Afterwards, it waits for each task in submission order (applying the page
        timeout) and handles failures according to ``OCRD_EXISTING_OUTPUT`` and
        ``OCRD_MISSING_OUTPUT``.

        If any page task raises :py:class:`NotImplementedError`, it falls back to
        the deprecated :py:meth:`process`.

        (This should be overridden by subclasses to handle cases
        like post-processing or computation across pages.)

        Args:
            workspace: the workspace to process; also stored as :py:data:`workspace`

        Raises:
            AssertionError: from :py:meth:`verify`, or if more than one worker is
                configured without a METS server backed workspace
            FileExistsError: if a page task raises it and ``OCRD_EXISTING_OUTPUT`` is ``ABORT``
            ValueError: if ``OCRD_MISSING_OUTPUT`` has an unknown value
            Exception: the page failure itself if ``OCRD_MISSING_OUTPUT`` is ``ABORT``,
                or if too many pages were skipped or replaced by fallback output
        """
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            self.verify()
            try:
                nr_succeeded = 0
                nr_skipped = 0
                nr_copied = 0

                # set up multithreading
                # (the class-level limits only ever lower the configured values)
                max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
                if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
                    self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers)
                    max_workers = self.max_workers
                if max_workers > 1:
                    assert isinstance(workspace.mets, ClientSideOcrdMets), \
                        "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url"
                max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT)
                if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT:
                    self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
                    max_seconds = self.max_page_seconds
                # NOTE(review): the executor is only shut down explicitly on the
                # success path at the end of this try block.
                executor = ThreadPoolExecutor(
                    max_workers=max_workers or 1,
                    thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
                )
                self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
                # maps each submitted future to its (page_id, input_files)
                tasks = {}

                for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
                    input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
                    # the page ID is taken from the first file present in the tuple
                    page_id = next(input_file.pageId
                                   for input_file in input_file_tuple
                                   if input_file)
                    self._base_logger.info(f"preparing page {page_id}")
                    for i, input_file in enumerate(input_file_tuple):
                        if input_file is None:
                            # file/page not found in this file grp
                            continue
                        input_files[i] = input_file
                        if not self.download:
                            continue
                        try:
                            input_files[i] = self.workspace.download_file(input_file)
                        except (ValueError, FileNotFoundError, HTTPError) as e:
                            # keep the non-downloaded file entry and carry on
                            self._base_logger.error(repr(e))
                            self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
                    # process page
                    tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
                self._base_logger.debug("submitted %d processing tasks", len(tasks))

                for task in tasks:
                    # wait for results, handle errors
                    page_id, input_files = tasks[task]
                    # FIXME: differentiate error cases in various ways:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    try:
                        self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
                        # a timeout of 0 means "no timeout"
                        task.result(timeout=max_seconds or None)
                        nr_succeeded += 1
                    # exclude NotImplementedError, so we can try process() below
                    except NotImplementedError:
                        raise
                    # handle input failures separately
                    except FileExistsError as err:
                        if config.OCRD_EXISTING_OUTPUT == 'ABORT':
                            raise err
                        if config.OCRD_EXISTING_OUTPUT == 'SKIP':
                            continue
                        if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
                            # too late here, must not happen
                            raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
                    # broad coverage of output failures (including TimeoutError)
                    except (Exception, TimeoutError) as err:
                        # FIXME: add re-usable/actionable logging
                        if config.OCRD_MISSING_OUTPUT == 'ABORT':
                            self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
                            raise err
                        self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
                        if config.OCRD_MISSING_OUTPUT == 'SKIP':
                            nr_skipped += 1
                            continue
                        if config.OCRD_MISSING_OUTPUT == 'COPY':
                            # fall back to passing the first input file through
                            self._copy_page_file(input_files[0])
                            nr_copied += 1
                        else:
                            desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
                            raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")

                # NOTE(review): both checks compare succeeded/failed against the
                # threshold - confirm the intended direction of the ratio against
                # the documentation of OCRD_MAX_MISSING_OUTPUTS.
                if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
                    raise Exception(f"too many failures with skipped output ({nr_skipped})")
                if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
                    # report the number of copied (not skipped) pages here
                    raise Exception(f"too many failures with fallback output ({nr_copied})")
                executor.shutdown()

            except NotImplementedError:
                # fall back to deprecated method
                try:
                    self.process()
                except Exception as err:
                    # suppress the NotImplementedError context
                    raise err from None
566
567
    def _copy_page_file(self, input_file : OcrdFileType) -> None:
        """
        Pass the given ``input_file`` of the :py:data:`workspace` through
        as if it was a processing result: parse it as PAGE, give it the
        output file ID, add processing metadata, and add it to the
        output fileGrp.

        If the file cannot be parsed as (or converted to) PAGE, this
        logs an error and returns without adding anything.
        """
        assert isinstance(input_file, get_args(OcrdFileType))
        page_id = input_file.pageId
        self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
        try:
            pcgts = page_from_file(input_file)
        except ValueError as err:
            # not PAGE and not an image to generate PAGE for
            self._base_logger.error(f"non-PAGE input for page {input_file.pageId}: {err}")
            return
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.add_metadata(pcgts)
        target_path = os.path.join(self.output_file_grp, file_id + '.xml')
        self.workspace.add_file(
            file_id=file_id,
            file_grp=self.output_file_grp,
            page_id=input_file.pageId,
            local_filename=target_path,
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
        )
594
595
    def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
        """
        Process the given ``input_files`` of the :py:data:`workspace`,
        representing one physical page (one :py:class:`.OcrdFile` per
        input fileGrp) under the given :py:data:`.parameter`.

        Each file gets parsed as PAGE (files failing with ``ValueError``
        are logged and passed on as ``None``), the parsed pages are handed
        to :py:meth:`.process_page_pcgts`, and the returned images and PAGE
        result are saved into the output fileGrp of the workspace.

        (This should be overridden by subclasses to handle cases like
        multiple output fileGrps, non-PAGE input etc.)

        Raises:
            ValueError: if a result image references an object which is neither
                a ``PageType``, nor an ``AlternativeImageType``, nor ``None``
        """
        parsed_pages : List[Optional[OcrdPage]] = [None] * len(input_files)
        assert isinstance(input_files[0], get_args(OcrdFileType))
        page_id = input_files[0].pageId
        self._base_logger.info("processing page %s", page_id)
        for pos, file_ in enumerate(input_files):
            assert isinstance(file_, get_args(OcrdFileType))
            self._base_logger.debug(f"parsing file {file_.ID} for page {page_id}")
            try:
                parsed = page_from_file(file_)
                assert isinstance(parsed, OcrdPage)
                parsed_pages[pos] = parsed
            except ValueError as err:
                # not PAGE and not an image to generate PAGE for
                self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
        output_file_id = make_file_id(input_files[0], self.output_file_grp)
        result = self.process_page_pcgts(*parsed_pages, page_id=page_id)

        # first save all derived images (and reference them in the PAGE result) ...
        for image_result in result.images:
            image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
            image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
            reference = image_result.alternative_image
            if reference is None:
                pass # do not reference in PAGE result
            elif isinstance(reference, PageType):
                # special case: not an alternative image, but replacing the original image
                # (this is needed by certain processors when the original's coordinate system
                #  cannot or must not be kept)
                reference.set_imageFilename(image_file_path)
                reference.set_imageWidth(image_result.pil.width)
                reference.set_imageHeight(image_result.pil.height)
            elif isinstance(reference, AlternativeImageType):
                reference.set_filename(image_file_path)
            else:
                raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
                                 f"{type(reference)}")
            self.workspace.save_image_file(
                image_result.pil,
                image_file_id,
                self.output_file_grp,
                page_id=page_id,
                file_path=image_file_path,
            )

        # ... then the PAGE result itself
        result.pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(result.pcgts)
        page_file_path = os.path.join(self.output_file_grp, output_file_id + '.xml')
        self.workspace.add_file(
            file_id=output_file_id,
            file_grp=self.output_file_grp,
            page_id=page_id,
            local_filename=page_file_path,
            mimetype=MIMETYPE_PAGE,
            content=to_xml(result.pcgts),
        )
656
657
    def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
        """
        Process one physical page of the :py:data:`.workspace` and return
        the resulting :py:class:`.OcrdPageResult`.

        The page is passed as ``input_pcgts``, i.e. one parsed
        :py:class:`.OcrdPage` per input fileGrp, and is to be processed
        under the given :py:data:`.parameter`.

        Implementations may append instances of :py:class:`.OcrdPageResultImage`
        to the ``images`` attribute of the returned :py:class:`.OcrdPageResult`.
        Each of them requires the fields

        - ``pil`` (:py:class:`PIL.Image` image data),
        - ``file_id_suffix`` (used for generating IDs of the saved image) and
        - ``alternative_image`` (reference of the
          :py:class:`ocrd_models.ocrd_page.AlternativeImageType`
          for setting the filename of the saved image).

        (This is where the main functionality lives. Subclasses must override it,
        unless they override :py:meth:`.process_page_file` in a way that never
        calls this method.)
        """
        raise NotImplementedError()
676
677
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Record this processing step in the PAGE-XML metadata of ``pcgts``.

        Appends one :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem``
        of type ``processingStep`` to the ``Metadata`` element of the
        :py:class:`.OcrdPage`. It carries two label groups: the runtime
        parameters, and the versions of this processor and of ocrd/core.
        """
        metadata_obj = pcgts.get_Metadata()
        assert metadata_obj is not None
        executable = self.ocrd_tool['executable']
        # one label per runtime parameter
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=name, value=value)
                   for name, value in self.parameter.items()])
        # versions of the processor itself and of the framework
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=executable, value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        step_item = MetadataItemType(
            type_="processingStep",
            name=self.ocrd_tool['steps'][0],
            value=executable,
            Labels=[parameter_labels, version_labels])
        metadata_obj.add_MetadataItem(step_item)
702
703
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_

        If ``val`` itself exists as a path, it is returned unchanged.
        Otherwise the first existing path among the resource candidates
        for this processor's executable is returned.

        Args:
            val (string): resource value to resolve

        Raises:
            ResourceNotFoundError: if neither ``val`` nor any candidate exists
        """
        executable = self.ocrd_tool['executable']
        if exists(val):
            self._base_logger.debug(f"Resolved to absolute path {val}")
            return val
        # FIXME: remove once workspace arg / old_pwd is gone:
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=cwd, moduled=self.moduledir)
        found = list(filter(exists, candidates))
        if not found:
            raise ResourceNotFoundError(val, executable)
        self._base_logger.debug(f"Resolved {val} to absolute path {found[0]}")
        return found[0]
727
728
    def show_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_,
        then print its contents to stdout.

        A regular file is written verbatim. A directory is written as a
        gzip-compressed tarball of its contents (archived relative to the
        directory itself).

        Args:
            val (string): resource value to show
        """
        fpath = Path(self.resolve_resource(val))
        if not fpath.is_dir():
            sys.stdout.buffer.write(fpath.read_bytes())
            return
        # archive from inside the directory, so member names are relative to it
        with pushd_popd(fpath):
            fileobj = io.BytesIO()
            with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball:
                tarball.add('.')
            fileobj.seek(0)
            copyfileobj(fileobj, sys.stdout.buffer)
748
749
    def list_all_resources(self):
        """
        List all resources found in the filesystem and matching content-type by filename suffix

        Yields one :py:class:`pathlib.Path` per resource. If the accepted
        MIME types include ``*/*``, nothing is filtered out.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        accept_any = '*/*' in mimetypes
        found = list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir)
        for res in map(Path, found):
            if not accept_any:
                if res.is_dir() and 'text/directory' not in mimetypes:
                    continue
                # if we do not know all MIME types, then keep the file, otherwise require suffix match
                if res.is_file():
                    suffix_matches = any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
                                         for mime in mimetypes)
                    if not suffix_matches:
                        continue
            yield res
764
765
    @property
    def module(self):
        """
        The top-level module this processor belongs to.

        This is the shortest dotted prefix of the class's ``__module__``
        whose entry in :py:data:`sys.modules` has a non-empty ``__file__``.
        """
        # find shortest prefix path that is not just a namespace package
        parts = self.__module__.split('.')
        for depth in range(1, len(parts) + 1):
            fqname = '.'.join(parts[:depth])
            if getattr(sys.modules[fqname], '__file__', None):
                return fqname
        # fall-back
        return self.__module__
780
781
    @property
    def moduledir(self):
        """
        The filesystem path of the module directory.

        Looked up as the resource ``.`` of the package named by :py:attr:`module`.
        """
        package = self.module
        return resource_filename(package, '.')
787
788
    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        The selection is delegated to :py:meth:`zip_input_files` (without
        MIME type filter and with ``on_error='abort'``), which for each
        physical page works as follows:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)

        See `algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.

        Raises:
            ValueError: if no input fileGrp is set
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        file_tuples = self.zip_input_files(mimetype=None, on_error='abort')
        if file_tuples:
            # each tuple has one slot per input fileGrp - only one is allowed here
            assert len(file_tuples[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
            return [file_tuple[0] for file_tuple in file_tuples]
        return []
814
815
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all); any further
          files for that page in that fileGrp are ignored, too
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Multiple matches for PAGE-XML will always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             on_error (string): How to handle multiple file matches per page.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        Raises:
            ValueError: if no input fileGrp is set
            NonUniqueInputFile: on multiple PAGE files for a page, or on any
                other multiple match if ``on_error`` is ``abort``
            MissingInputFile: if a page has no file in some fileGrp and
                ``config.OCRD_MISSING_INPUT`` is ``abort``
            Exception: on a multiple match with an unknown ``on_error`` value
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = {}
        # (pageId, fileGrp index) slots already emptied by on_error='skip':
        # without this, a third duplicate would find the slot empty again
        # and be taken as if it were the only match
        skipped = set()

        def resolve_conflict(ift, i, ifg, file_, mime):
            # apply the on_error strategy to a non-unique match in slot i
            if on_error == 'skip':
                ift[i] = None
                skipped.add((file_.pageId, i))
            elif on_error == 'first':
                pass # keep first match
            elif on_error == 'last':
                ift[i] = file_
            elif on_error == 'abort':
                raise NonUniqueInputFile(ifg, file_.pageId, mime)
            else:
                raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype)
            for file_ in files_:
                if not file_.pageId:
                    # ignore document-global files
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if (file_.pageId, i) in skipped:
                    self._base_logger.debug(f"ignoring file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                            f"(page already skipped as non-unique)")
                    continue
                if not ift[i]:
                    self._base_logger.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}")
                    ift[i] = file_
                    continue
                self._base_logger.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}")
                # fileGrp has multiple files for this page ID
                if mimetype:
                    # filter was active, this must not happen
                    self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                              f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}")
                    resolve_conflict(ift, i, ifg, file_, mimetype)
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        # two PAGE files for the same page are never acceptable
                        raise NonUniqueInputFile(ifg, file_.pageId, None)
                    # otherwise keep PAGE match
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                              f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}")
                    resolve_conflict(ift, i, ifg, file_, None)
        # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range)
        if self.page_id and not pages:
            self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
                                       f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
        ifts = []
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # could be from non-unique with on_error=skip or from true gap
                    self._base_logger.error(f'Found no file for page {page} in file group {ifg}')
                    if config.OCRD_MISSING_INPUT == 'abort':
                        raise MissingInputFile(ifg, page, mimetype)
            if not any(ifiles):
                # must be from non-unique with on_error=skip
                self._base_logger.warning(f'Found no files for {page} - skipping')
                continue
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
936
937
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
    """Generate a string describing the full CLI of this processor including params.

    Args:
        ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
        processor_instance (object, optional): the processor implementation
            (for adding any module/class/function docstrings)
        subcommand (string, optional): ``worker`` or ``server`` to get the help
            for that subcommand; if empty, the help for the processor itself

    Returns:
        The help text, or ``None`` if ``subcommand`` is neither empty
        nor one of the known subcommands.
    """
    doc_help = ''
    if processor_instance:
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
        # Try to find the most concrete docstring among the various methods that an implementation
        # could overload, first serving.
        # In doing so, compare with Processor to avoid a glitch in the way py>=3.5 inherits docstrings.
        # (They are supposed to only repeat information inspect.getdoc, rather than inherit __doc__ itself.)
        for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']:
            instance_method = getattr(processor_instance, method)
            superclass_method = getattr(Processor, method)
            if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__:
                doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n'
                break
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)
    subcommands = '''\
    worker      Start a processing worker rather than do local processing
    server      Start a processor server rather than do local processing
'''

    processing_worker_options = '''\
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --log-filename                  Filename to redirect STDOUT/STDERR to,
                                  if specified.
'''

    processing_server_options = '''\
  --address                       The Processor server address in format
                                  "{host}:{port}"
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
'''

    processing_options = '''\
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those).
                                  Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
  --debug                         Abort on any errors with full stack trace.
                                  Short-hand for OCRD_MISSING_OUTPUT=ABORT
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
                                  If URL starts with http:// start an HTTP server there,
                                  otherwise URL is a path to an on-demand-created unix socket
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]
  --log-filename LOG-PATH         File to redirect stderr logging to (overriding ocrd_logging.conf).
'''

    information_options = '''\
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version
'''

    parameter_help = ''
    if not ocrd_tool.get('parameters'):
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        for param_name, param in ocrd_tool['parameters'].items():
            # annotate each parameter as required, or with its default (if any)
            if param.get('required'):
                suffix = ' - REQUIRED'
            elif 'default' in param:
                suffix = ' - %s' % json.dumps(param['default'])
            else:
                suffix = ''
            parameter_help += wrap('"%s" [%s%s]' % (param_name, param['type'], suffix))
            parameter_help += '\n ' + wrap(param['description'])
            if 'enum' in param:
                parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
            parameter_help += "\n"

    if not subcommand:
        return f'''\
Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]

  {ocrd_tool['description']}{doc_help}

Subcommands:
{subcommands}
Options for processing:
{processing_options}
Options for information:
{information_options}
Parameters:
{parameter_help}
'''
    if subcommand == 'worker':
        return f'''\
Usage: {ocrd_tool['executable']} worker [OPTIONS]

  Run {ocrd_tool['executable']} as a processing worker.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_worker_options}
'''
    if subcommand == 'server':
        return f'''\
Usage: {ocrd_tool['executable']} server [OPTIONS]

  Run {ocrd_tool['executable']} as a processor server.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_server_options}
'''
    # unknown subcommand: no help text available
    return None
1084