Passed
Pull Request — master (#1240)
by
unknown
05:46
created

ocrd.processor.base.Processor.version()   A

Complexity

Conditions 1

Size

Total Lines 10
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 10
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""
2
Processor base class and helper functions.
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from functools import cached_property
13
from os.path import exists, join
14
from shutil import copyfileobj
15
import json
16
import os
17
from os import getcwd
18
from pathlib import Path
19
from typing import Any, List, Optional, Union, get_args
20
import sys
21
import inspect
22
import tarfile
23
import io
24
import weakref
25
from frozendict import frozendict
26
from concurrent.futures import ThreadPoolExecutor, TimeoutError
27
28
from click import wrap_text
29
from deprecated import deprecated
30
from requests import HTTPError
31
32
from ..workspace import Workspace
33
from ..mets_server import ClientSideOcrdMets
34
from ocrd_models.ocrd_file import OcrdFileType
35
from .ocrd_page_result import OcrdPageResult
36
from ocrd_utils import (
37
    VERSION as OCRD_VERSION,
38
    MIMETYPE_PAGE,
39
    MIME_TO_EXT,
40
    config,
41
    getLogger,
42
    list_resource_candidates,
43
    pushd_popd,
44
    list_all_resources,
45
    get_processor_resource_types,
46
    resource_filename,
47
    parse_json_file_with_comments,
48
    make_file_id,
49
    deprecation_warning
50
)
51
from ocrd_validators import ParameterValidator
52
from ocrd_models.ocrd_page import (
53
    PageType,
54
    AlternativeImageType,
55
    MetadataItemType,
56
    LabelType,
57
    LabelsType,
58
    OcrdPage,
59
    to_xml,
60
)
61
from ocrd_modelfactory import page_from_file
62
from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
63
64
# XXX imports must remain for backwards-compatibility
65
from .helpers import run_cli, run_processor # pylint: disable=unused-import
66
67
68
class ResourceNotFoundError(FileNotFoundError):
    """
    Raised when a requested processor resource cannot be resolved.

    Attributes:
        name: name of the resource that was requested
        executable: name of the processor executable the resource belongs to
        message: the rendered error text (also passed to the base exception),
            including a hint on how to download the resource
    """
    def __init__(self, name, executable):
        self.name, self.executable = name, executable
        problem = f"Could not find resource '{name}' for executable '{executable}'. "
        remedy = f"Try 'ocrd resmgr download {executable} {name}' to download this resource."
        self.message = problem + remedy
        super().__init__(self.message)
79
80
class NonUniqueInputFile(ValueError):
    """
    Raised when the fileGrp / pageId / mimetype selector is ambiguous, i.e.
    it yields multiple PAGE files, or no PAGE files but multiple images,
    or multiple files of the given mimetype.

    Attributes:
        fileGrp: the file group that was searched
        pageId: the physical page that was searched
        mimetype: the requested mimetype (falsy means "PAGE or image(s)")
        message: the rendered error text (also passed to the base exception)
    """
    def __init__(self, fileGrp, pageId, mimetype):
        self.fileGrp, self.pageId, self.mimetype = fileGrp, pageId, mimetype
        # a falsy mimetype stands for the default PAGE-or-image lookup
        wanted = mimetype or 'PAGE+image(s)'
        self.message = (f"Could not determine unique input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {wanted}")
        super().__init__(self.message)
93
94
class MissingInputFile(ValueError):
    """
    Raised when the fileGrp / pageId / mimetype selector finds nothing, i.e.
    it yields no PAGE files, or no PAGE and no image files,
    or no files of the given mimetype.

    Attributes:
        fileGrp: the file group that was searched
        pageId: the physical page that was searched
        mimetype: the requested mimetype (falsy means "PAGE or image(s)")
        message: the rendered error text (also passed to the base exception)
    """
    def __init__(self, fileGrp, pageId, mimetype):
        self.fileGrp, self.pageId, self.mimetype = fileGrp, pageId, mimetype
        # a falsy mimetype stands for the default PAGE-or-image lookup
        wanted = mimetype or 'PAGE+image(s)'
        self.message = (f"Could not find input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {wanted}")
        super().__init__(self.message)
107
108
class Processor():
109
    """
110
    A processor is a tool that implements the uniform OCR-D
111
    `command-line interface for run-time data processing <https://ocr-d.de/en/spec/cli>`_.
112
113
    That is, it executes a single workflow step, or a combination of workflow steps,
114
    on the workspace (represented by local METS). It reads input files for all or selected
115
    physical pages of the input fileGrp(s), computes additional annotation, and writes output
116
    files for them into the output fileGrp(s). It may take a number of optional or mandatory
117
    parameters.
118
    """
119
120
    max_instances : int = -1
121
    """
122
    maximum number of cached instances (ignored if negative), to be applied on top of
123
    :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
124
125
    (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.)
126
    """
127
128
    max_workers : int = -1
129
    """
130
    maximum number of processor threads for page-parallel processing (ignored if negative),
131
    to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
132
    whatever is smaller).
133
134
    (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
135
    - at once, or if your class is not thread-safe.)
136
    """
137
138
    max_page_seconds : int = -1
139
    """
140
    maximum number of seconds that may be spent processing a single page (ignored if negative),
141
    to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT`
142
    (i.e. whatever is smaller).
143
144
    (Override this if you know how costly this processor may be, irrespective of image size
145
    or complexity of the page.)
146
    """
147
148
    @property
    def metadata_filename(self) -> str:
        """
        File name of ``ocrd-tool.json``, relative to the package.

        Consumed by :py:data:`metadata_location`.

        (Override if ``ocrd-tool.json`` does not sit in the root of the module,
        e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``).
        """
        relative_name = 'ocrd-tool.json'
        return relative_name
159
160
    @cached_property
    def metadata_location(self) -> Path:
        """
        Absolute path of the ``ocrd-tool.json`` file as distributed with the package.

        Walks the dotted package name of the module defining this processor from
        the left and uses the shortest prefix whose import spec has a location
        (namespace packages do not), then resolves :py:data:`metadata_filename`
        relative to that prefix.

        Used by :py:data:`metadata_rawdict`.

        (Override if ``ocrd-tool.json`` is not distributed with the Python package.)

        Raises:
            Exception: if no prefix of the package name has a location.
        """
        module = inspect.getmodule(self)
        package = module.__package__
        tokens = package.split('.')
        # for namespace packages, we cannot just use the first token
        for depth in range(1, len(tokens) + 1):
            prefix = '.'.join(tokens[:depth])
            if sys.modules[prefix].__spec__.has_location:
                return resource_filename(prefix, self.metadata_filename)
        # interpolate the package name into the message (exceptions do not
        # apply logging-style ``%s`` arguments themselves)
        raise Exception(f"cannot find top-level module prefix for {package}")
177
178
    @cached_property
    def metadata_rawdict(self) -> dict:
        """
        The ``ocrd-tool.json`` contents of the package as parsed from the file at
        :py:data:`metadata_location` (raw, i.e. neither validated nor expanded here).

        Used by :py:data:`metadata`.

        (Override if ``ocrd-tool.json`` is not in a file.)
        """
        location = self.metadata_location
        return parse_json_file_with_comments(location)
188
189
    @cached_property
    def metadata(self) -> dict:
        """
        The ``ocrd-tool.json`` dict contents of the package, according to the OCR-D
        `spec <https://ocr-d.de/en/spec/ocrd_tool>`_ for processor tools.

        The dict from :py:data:`metadata_rawdict` is passed through
        :py:class:`~ocrd_validators.ocrd_tool_validator.OcrdToolValidator`
        (`schema <https://ocr-d.de/en/spec/ocrd_tool#definition>`_); a failing
        report gets logged as an error, but the dict is returned regardless.

        Used by :py:data:`ocrd_tool` and :py:data:`version`.

        (Override if you want to provide metadata programmatically instead of a
        JSON file.)
        """
        tool_json = self.metadata_rawdict
        report = OcrdToolValidator.validate(tool_json)
        if report.is_valid:
            return tool_json
        # a report without errors is merely "problematic", otherwise "invalid"
        severity = 'invalid' if report.errors else 'problematic'
        self.logger.error(f"The ocrd-tool.json of this processor is {severity}:\n"
                          f"{report.to_xml()}.\nPlease open an issue at {tool_json.get('git_url', 'the website')}.")
        return tool_json
210
211
    @cached_property
    def version(self) -> str:
        """
        The program version of the package, looked up under the
        ``version`` key of :py:data:`metadata`.

        (Override if you do not want to use :py:data:`metadata` lookup
        mechanism.)
        """
        package_metadata = self.metadata
        return package_metadata['version']
221
222
    @cached_property
    def executable(self) -> str:
        """
        The executable name of this processor tool: the base name of the
        file belonging to the outermost frame of the current call stack.

        Used by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`.

        (Override if your entry-point name deviates from the ``executable``
        name, or the processor gets instantiated from another runtime.)
        """
        outermost_frame = inspect.stack()[-1]
        return os.path.basename(outermost_frame.filename)
234
235
    @cached_property
    def ocrd_tool(self) -> dict:
        """
        The ``ocrd-tool.json`` dict contents of this processor tool, i.e. the
        entry for :py:data:`executable` in the ``tools`` part of
        :py:data:`metadata`.

        (Override if you do not want to use :py:data:`metadata` lookup
        mechanism.)
        """
        all_tools = self.metadata['tools']
        return all_tools[self.executable]
246
247
    @property
    def parameter(self) -> Optional[dict]:
        """the runtime parameter dict to be used by this processor (``None`` as long as none was set)"""
        return getattr(self, '_parameter', None)
253
254
    @parameter.setter
    def parameter(self, parameter : dict) -> None:
        # a parameter set is already active: bring the processor down first
        if self.parameter is not None:
            self.shutdown()
        # validate against the parameter definitions of this tool
        report = ParameterValidator(self.ocrd_tool).validate(parameter)
        if not report.is_valid:
            raise ValueError(f'Invalid parameters:\n{report.to_xml()}')
        # store a read-only copy of the parameter dict
        self._parameter = frozendict(parameter)
        # (re-)run setup to load models etc
        self.setup()
266
267
    def __init__(
            self,
            # FIXME: remove in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=config.OCRD_DOWNLOAD_INPUT,
            version=None
    ):
        """
        Instantiate the processor.

        If ``parameter`` is given, it gets assigned to :py:data:`.parameter`,
        which validates it and runs :py:meth:`setup`.

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process.
                 If not ``None``, then `chdir` to that directory.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
        Keyword Args:
             ocrd_tool (dict): Deprecated. If given, overrides :py:data:`ocrd_tool` and
                 (via its ``executable`` key) :py:data:`executable`.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``.
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
             output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process
                 (or empty for all pages).
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
             download_files (boolean): Whether input files will be downloaded prior to processing,
                 defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT`
             version (string): Deprecated. If given, overrides :py:data:`version`.
        """
        # shared tail of the deprecation notices for the per-run arguments
        pass_later = "is deprecated - pass as argument to process_workspace instead"
        if ocrd_tool is not None:
            deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                "use or override metadata/executable/ocrd-tool properties instead")
            self.ocrd_tool = ocrd_tool
            self.executable = ocrd_tool['executable']
        if version is not None:
            deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
                                "use or override metadata/version properties instead")
            self.version = version
        if workspace is not None:
            deprecation_warning("Passing a workspace argument other than 'None' to Processor " + pass_later)
            self.workspace = workspace
            # remember where we came from before entering the workspace directory
            self.old_pwd = os.getcwd()
            os.chdir(self.workspace.directory)
        if input_file_grp is not None:
            deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor " + pass_later)
            self.input_file_grp = input_file_grp
        if output_file_grp is not None:
            deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor " + pass_later)
            self.output_file_grp = output_file_grp
        if page_id is not None:
            deprecation_warning("Passing a page_id kwarg other than 'None' to Processor " + pass_later)
            # an empty string means "all pages", which is stored as None
            self.page_id = page_id or None
        self.download = download_files
        #: The logger to be used by processor implementations.
        # `ocrd.processor.base` internals should use :py:attr:`self._base_logger`
        self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
        self._base_logger = getLogger('ocrd.processor.base')
        # must come after the loggers: assigning validates and calls setup()
        if parameter is not None:
            self.parameter = parameter
        # ensure that shutdown gets called at destruction
        # NOTE(review): the callback is a bound method of self; the weakref docs
        # advise against that, as it keeps the object referenced - confirm intent
        self._finalizer = weakref.finalize(self, self.shutdown)
        # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
        mark_deprecated = deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
        self.process = mark_deprecated(self.process)
343
344
    def show_help(self, subcommand=None):
        """
        Print the usage description generated for this processor from its
        :py:data:`ocrd_tool` (standard CLI, ocrd-tool parameters and docstrings),
        optionally for the given ``subcommand``.
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self, subcommand=subcommand)
        print(help_text)
350
351
    def show_version(self):
        """
        Print this processor's :py:data:`version` along with the OCR-D core version.
        """
        banner = "Version %s, ocrd/core %s" % (self.version, OCRD_VERSION)
        print(banner)
356
357
    def verify(self):
        """
        Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.

        Checks (by assertion) that both are set, that their number matches the
        ``input_file_grp_cardinality`` / ``output_file_grp_cardinality`` of
        :py:data:`ocrd_tool` where declared, and that every input fileGrp exists
        in the workspace METS.

        Returns:
            ``True`` (kept for backwards compatibility)
        Raises:
            AssertionError: if any of the checks fails
        """
        assert self.input_file_grp is not None
        assert self.output_file_grp is not None
        grps_in = self.input_file_grp.split(',')
        grps_out = self.output_file_grp.split(',')

        def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
            # spec is either an exact count or a [minimum, maximum] pair;
            # non-positive numbers mean "unrestricted"
            count = len(grps)
            if not isinstance(spec, int):
                assert isinstance(spec, list)
                lower, upper = spec[0], spec[1]
                if lower > 0:
                    assert count >= lower, msg % (count, str(spec))
                if upper > 0:
                    assert count <= upper, msg % (count, str(spec))
            elif spec > 0:
                assert count == spec, msg % (count, str(spec))

        # FIXME: enforce unconditionally as soon as grace period for deprecation is over
        checks = (
            ('input_file_grp_cardinality', grps_in,
             "Unexpected number of input file groups %d vs %s"),
            ('output_file_grp_cardinality', grps_out,
             "Unexpected number of output file groups %d vs %s"),
        )
        for key, grps, msg in checks:
            if key in self.ocrd_tool:
                assert_file_grp_cardinality(grps, self.ocrd_tool[key], msg)
        for grp in grps_in:
            assert grp in self.workspace.mets.file_groups
        # keep this for backwards compatibility:
        return True
388
389
    def dump_json(self):
        """
        Print :py:attr:`ocrd_tool`, serialized as indented JSON, on stdout.
        """
        serialized = json.dumps(self.ocrd_tool, indent=True)
        print(serialized)
394
395
    def dump_module_dir(self):
        """
        Print :py:attr:`moduledir` on stdout.
        """
        module_dir = self.moduledir
        print(module_dir)
400
401
    def list_resources(self):
        """
        Print each entry yielded by :py:meth:`list_all_resources`
        (the installed resource files found in the search paths), one per line.
        """
        for resource_path in self.list_all_resources():
            print(resource_path)
407
408
    def setup(self) -> None:
409
        """
410
        Prepare the processor for actual data processing,
411
        prior to changing to the workspace directory but
412
        after parsing parameters.
413
414
        (Override this to load models into memory etc.)
415
        """
416
        pass
417
418
    def shutdown(self) -> None:
        """
        Hook for bringing down the processor after data processing.

        Runs after changing back from the workspace directory, but
        before exiting (or setting up with different parameters).
        Does nothing by default.

        (Override this to unload models from memory etc.)
        """
        return None
427
428
    @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
    def process(self) -> None:
        """
        Deprecated entry point: process all files of the :py:data:`workspace`
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        The base implementation only raises :py:class:`NotImplementedError`.

        (This contains the main functionality and needs to be
        overridden by subclasses.)
        """
        raise NotImplementedError
441
442
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        Changes into the workspace directory for the duration of the call,
        runs :py:meth:`verify`, then submits one :py:meth:`.process_page_file`
        task per page to a thread pool and waits for the tasks in submission order.

        Per-page failures are handled according to ``config.OCRD_EXISTING_OUTPUT``
        (for :py:class:`FileExistsError`) and ``config.OCRD_MISSING_OUTPUT``
        (for any other exception, including timeouts). If a
        :py:class:`NotImplementedError` surfaces, the deprecated
        :py:meth:`process` gets called as a fallback.

        (This should be overridden by subclasses to handle cases
        like post-processing or computation across pages.)

        Args:
            workspace: the workspace to process
        Raises:
            AssertionError: from :py:meth:`verify`, or when more than one
                worker is configured without a METS server client
            FileExistsError: if output exists and ``OCRD_EXISTING_OUTPUT`` is ``ABORT``
            ValueError: if ``OCRD_MISSING_OUTPUT`` has an unknown value
            Exception: the page's own exception if ``OCRD_MISSING_OUTPUT`` is
                ``ABORT``, or when too many pages were skipped or copied
        """
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            self.verify()
            try:
                nr_succeeded = 0
                nr_skipped = 0
                nr_copied = 0

                # set up multithreading
                max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
                if 0 < self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
                    self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers)
                    max_workers = self.max_workers
                if max_workers > 1:
                    assert isinstance(workspace.mets, ClientSideOcrdMets), \
                        "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url"
                max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT)
                if 0 < self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT:
                    self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
                    max_seconds = self.max_page_seconds
                # NOTE(review): the executor is only shut down on the success path below
                executor = ThreadPoolExecutor(
                    max_workers=max_workers or 1,
                    thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
                )
                self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
                # maps each submitted future to its (page_id, input_files)
                tasks = {}

                for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
                    input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
                    # the page ID is taken from the first file present for this page
                    page_id = next(input_file.pageId
                                   for input_file in input_file_tuple
                                   if input_file)
                    self._base_logger.info(f"preparing page {page_id}")
                    for i, input_file in enumerate(input_file_tuple):
                        if input_file is None:
                            # file/page not found in this file grp
                            continue
                        input_files[i] = input_file
                        if not self.download:
                            continue
                        try:
                            input_files[i] = self.workspace.download_file(input_file)
                        except (ValueError, FileNotFoundError, HTTPError) as e:
                            # keep the undownloaded file entry and carry on
                            self._base_logger.error(repr(e))
                            self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
                    # process page
                    tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
                self._base_logger.debug("submitted %d processing tasks", len(tasks))

                for task in tasks:
                    # wait for results, handle errors
                    page_id, input_files = tasks[task]
                    # FIXME: differentiate error cases in various ways:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    try:
                        self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
                        # a timeout of 0 means "no limit"
                        task.result(timeout=max_seconds or None)
                        nr_succeeded += 1
                    # exclude NotImplementedError, so we can try process() below
                    except NotImplementedError:
                        raise
                    # handle input failures separately
                    except FileExistsError as err:
                        if config.OCRD_EXISTING_OUTPUT == 'ABORT':
                            raise
                        if config.OCRD_EXISTING_OUTPUT == 'SKIP':
                            continue
                        if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
                            # too late here, must not happen
                            raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
                    # broad coverage of output failures (including TimeoutError)
                    except (Exception, TimeoutError) as err:
                        # FIXME: add re-usable/actionable logging
                        if config.OCRD_MISSING_OUTPUT == 'ABORT':
                            self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
                            raise
                        self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
                        if config.OCRD_MISSING_OUTPUT == 'SKIP':
                            nr_skipped += 1
                            continue
                        if config.OCRD_MISSING_OUTPUT == 'COPY':
                            self._copy_page_file(input_files[0])
                            nr_copied += 1
                        else:
                            desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
                            raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")

                if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
                    raise Exception(f"too many failures with skipped output ({nr_skipped})")
                if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
                    # report the number of copied (not skipped) pages here
                    raise Exception(f"too many failures with fallback output ({nr_copied})")
                executor.shutdown()

            except NotImplementedError:
                # fall back to deprecated method
                self.process()
553
554
    def _copy_page_file(self, input_file : OcrdFileType) -> None:
        """
        Copy the given ``input_file`` of the :py:data:`workspace`
        (one file of one physical page) into the output fileGrp
        and add it as if it was a processing result.

        The file gets parsed as PAGE, receives a new ``pcGtsId`` and the
        processing metadata from :py:meth:`add_metadata`, and is then added
        to the workspace. If it cannot be parsed (:py:class:`ValueError`),
        an error is logged and nothing is added.
        """
        input_pcgts : OcrdPage
        assert isinstance(input_file, get_args(OcrdFileType))
        self._base_logger.debug(f"parsing file {input_file.ID} for page {input_file.pageId}")
        try:
            input_pcgts = page_from_file(input_file)
        except ValueError as err:
            # not PAGE and not an image to generate PAGE for
            self._base_logger.error(f"non-PAGE input for page {input_file.pageId}: {err}")
            return
        output_file_id = make_file_id(input_file, self.output_file_grp)
        input_pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(input_pcgts)
        # register the copy under the output fileGrp
        new_file = dict(
            file_id=output_file_id,
            file_grp=self.output_file_grp,
            page_id=input_file.pageId,
            local_filename=join(self.output_file_grp, output_file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(input_pcgts),
            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
        )
        self.workspace.add_file(**new_file)
582
583
    def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
        """
        Process the given ``input_files`` of the :py:data:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`.OcrdFile` per input fileGrp)
        under the given :py:data:`.parameter`, and make sure the
        results get added accordingly.

        Every input file is parsed as PAGE (files failing with
        :py:class:`ValueError` are logged and passed on as ``None``), then
        handed to :py:meth:`.process_page_pcgts`. All images of the result
        are saved into the output fileGrp, followed by the resulting PAGE file.

        (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses
        to handle cases like multiple output fileGrps, non-PAGE input etc.)

        Raises:
            ValueError: if a result image references an object of unknown type
        """
        input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
        first_file = input_files[0]
        assert isinstance(first_file, get_args(OcrdFileType))
        page_id = first_file.pageId
        self._base_logger.info("processing page %s", page_id)
        for idx, input_file in enumerate(input_files):
            assert isinstance(input_file, get_args(OcrdFileType))
            self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
            try:
                parsed = page_from_file(input_file)
                assert isinstance(parsed, OcrdPage)
                input_pcgts[idx] = parsed
            except ValueError as err:
                # not PAGE and not an image to generate PAGE for
                self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
        output_file_id = make_file_id(first_file, self.output_file_grp)
        result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
        for image_result in result.images:
            image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
            image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
            target = image_result.alternative_image
            if target is None:
                pass # do not reference in PAGE result
            elif isinstance(target, PageType):
                # special case: not an alternative image, but replacing the original image
                # (this is needed by certain processors when the original's coordinate system
                #  cannot or must not be kept)
                target.set_imageFilename(image_file_path)
                target.set_imageWidth(image_result.pil.width)
                target.set_imageHeight(image_result.pil.height)
            elif isinstance(target, AlternativeImageType):
                target.set_filename(image_file_path)
            else:
                raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
                                 f"{type(target)}")
            self.workspace.save_image_file(
                image_result.pil,
                image_file_id,
                self.output_file_grp,
                page_id=page_id,
                file_path=image_file_path,
                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
            )
        # finally add the PAGE result itself
        output_pcgts = result.pcgts
        output_pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(output_pcgts)
        self.workspace.add_file(
            file_id=output_file_id,
            file_grp=self.output_file_grp,
            page_id=page_id,
            local_filename=join(self.output_file_grp, output_file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(output_pcgts),
            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
        )
646
647
    def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
        """
        Process the given ``input_pcgts`` of the :py:data:`.workspace`,
        representing one physical page (passed as one parsed
        :py:class:`.OcrdPage` per input fileGrp)
        under the given :py:data:`.parameter`, and return the
        resulting :py:class:`.OcrdPageResult`.

        Implementations may add instances of :py:class:`.OcrdPageResultImage`
        to the ``images`` attribute of the result. Each of these carries
        ``pil`` (:py:class:`PIL.Image` image data), ``file_id_suffix`` (used for
        generating the ID of the saved image) and ``alternative_image`` (the
        :py:class:`ocrd_models.ocrd_page.AlternativeImageType` on which the
        filename of the saved image gets set).

        The base implementation only raises :py:class:`NotImplementedError`.

        (This contains the main functionality and must be overridden by subclasses,
        unless it does not get called by some overridden :py:meth:`.process_page_file`.)
        """
        raise NotImplementedError
666
667
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Record this processing step in the PAGE-XML metadata of ``pcgts``.

        Appends one :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem``
        of type ``processingStep`` to the metadata of the :py:class:`.OcrdPage`.
        The item is named after the first entry of the tool's ``steps``, valued with
        the tool's ``executable``, and carries two label groups: the runtime
        parameters, and the versions of the processor and of ocrd/core.
        """
        metadata_obj = pcgts.get_Metadata()
        assert metadata_obj is not None
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        # one label per runtime parameter
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=self.parameter[key])
                   for key in self.parameter])
        # versions of the processor itself and of the core library
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=self.ocrd_tool['executable'], value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        step_item = MetadataItemType(
            type_="processingStep",
            name=step_name,
            value=executable,
            Labels=[parameter_labels, version_labels])
        metadata_obj.add_MetadataItem(step_item)
692
693
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_

        Args:
            val (string): resource value to resolve

        Returns:
            ``val`` itself (unchanged) if it already names an existing path,
            otherwise the first existing path among the candidates produced
            by ``list_resource_candidates`` for this processor's executable.

        Raises:
            ResourceNotFoundError: if neither ``val`` nor any candidate exists.
        """
        executable = self.ocrd_tool['executable']
        if exists(val):
            # pass arguments lazily so the message is only built when DEBUG is enabled
            self._base_logger.debug("Resolved to absolute path %s", val)
            return val
        # FIXME: remove once workspace arg / old_pwd is gone:
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=cwd, moduled=self.moduledir)
        # stop probing the filesystem at the first candidate that exists
        resolved = next((cand for cand in candidates if exists(cand)), None)
        if resolved is not None:
            self._base_logger.debug("Resolved %s to absolute path %s", val, resolved)
            return resolved
        raise ResourceNotFoundError(val, executable)
717
718
    def show_resource(self, val):
        """
        Dump a resource to stdout.

        The resource name is resolved to a file path with the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_
        (via :py:meth:`resolve_resource`). A regular file is written to
        stdout verbatim. A directory is packed into a gzip-compressed tar
        archive (built in memory, with paths relative to that directory)
        which is then written to stdout.

        Args:
            val (string): resource value to show
        """
        resource_path = Path(self.resolve_resource(val))
        if not resource_path.is_dir():
            sys.stdout.buffer.write(resource_path.read_bytes())
            return
        # change into the directory so that archive members are stored relative to it
        with pushd_popd(resource_path):
            archive = io.BytesIO()
            with tarfile.open(fileobj=archive, mode='w:gz') as tarball:
                tarball.add('.')
            archive.seek(0)
            copyfileobj(archive, sys.stdout.buffer)
738
739
    def list_all_resources(self):
        """
        Iterate over all resources found in the filesystem for this processor,
        keeping only those whose content-type matches by filename suffix.

        Yields :py:class:`pathlib.Path` objects. If the resource MIME types of
        the tool include ``*/*``, nothing is filtered. Otherwise directories
        are only kept when ``text/directory`` is among the MIME types, and
        regular files only when their suffix matches at least one MIME type.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        accept_anything = '*/*' in mimetypes
        for candidate in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            path = Path(candidate)
            if not accept_anything:
                if path.is_dir() and 'text/directory' not in mimetypes:
                    continue
                # a MIME type without known extension falls back to the file's own suffix,
                # i.e. it matches any file; otherwise the suffix must agree
                if path.is_file() and all(path.suffix != MIME_TO_EXT.get(mime, path.suffix)
                                          for mime in mimetypes):
                    continue
            yield path
754
755
    @property
    def module(self):
        """
        Name of the top-level module this processor belongs to.

        This is the shortest dotted prefix of the class's ``__module__``
        whose entry in ``sys.modules`` has a ``__file__`` (i.e. which is
        not just a namespace package). If no prefix qualifies, the full
        ``__module__`` is returned.
        """
        parts = self.__module__.split('.')
        for depth in range(1, len(parts) + 1):
            prefix = '.'.join(parts[:depth])
            if getattr(sys.modules[prefix], '__file__', None):
                return prefix
        # fall-back
        return self.__module__
770
771
    @property
    def moduledir(self):
        """
        Filesystem path of the directory of :py:attr:`module`,
        as determined by ``resource_filename``.
        """
        top_level_module = self.module
        return resource_filename(top_level_module, '.')
777
778
    @property
    def input_files(self):
        """
        The input files, one per physical page (for single-valued :py:attr:`input_file_grp`).

        Selection per physical page:

        - a single PAGE-XML file for the page wins (all other files for
          that page are ignored)
        - failing that, a single image file is taken (all other files for
          that page are ignored)
        - anything else is an error (only PAGE-XML warrants having
          multiple images for a single page)

        See `algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

        This delegates to :py:meth:`zip_input_files` with ``on_error='abort'``
        and unpacks the resulting 1-tuples.

        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.

        Raises:
            ValueError: if no input fileGrp is set.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        file_tuples = self.zip_input_files(mimetype=None, on_error='abort')
        if file_tuples:
            # more than one entry per tuple means several input fileGrps were given
            assert len(file_tuples[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
            return [file_tuple[0] for file_tuple in file_tuples]
        return []
804
805
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all); this holds
          no matter how many further matches follow for that page
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Multiple matches for PAGE-XML will always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             on_error (string): How to handle multiple file matches per page.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        Raises:
            ValueError: if no input fileGrp is set.
            NonUniqueInputFile: on multiple PAGE-XML matches for a page, or
                on any multiple match if ``on_error`` is ``abort``.
            MissingInputFile: if some page has no file in some fileGrp and
                ``config.OCRD_MISSING_INPUT`` is ``abort``.
            Exception: if a conflict occurs and ``on_error`` is not one of
                the strategies above.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        ifgs = self.input_file_grp.split(",")
        log = self._base_logger

        def resolve_conflict(kept, file_, ifg):
            """Return the file to keep for a non-unique slot according to ``on_error``."""
            if on_error == 'skip':
                return None
            if on_error == 'first':
                return kept
            if on_error == 'last':
                return file_
            if on_error == 'abort':
                # report the MIME type only if the filter was active
                raise NonUniqueInputFile(ifg, file_.pageId, mimetype or None)
            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = {}
        # (pageId, fileGrp index) slots that were emptied by on_error='skip':
        # they must stay empty, otherwise the next match for the same page
        # would be accepted as if it were the only one
        skipped = set()
        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype)
            for file_ in files_:
                if not file_.pageId:
                    # ignore document-global files
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if (file_.pageId, i) in skipped:
                    log.debug(f"ignoring file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                              f"(page already skipped as non-unique)")
                    continue
                if not ift[i]:
                    log.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}")
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                log.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}")
                if mimetype:
                    # filter was active, this must not happen
                    log.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}")
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise NonUniqueInputFile(ifg, file_.pageId, None)
                    continue # keep PAGE match
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    log.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}")
                ift[i] = resolve_conflict(ift[i], file_, ifg)
                if ift[i] is None:
                    skipped.add((file_.pageId, i))
        # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range)
        if self.page_id and not pages:
            log.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
                         f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
        ifts = []
        for page, ifiles in pages.items():
            for ifg, ifile in zip(ifgs, ifiles):
                if not ifile:
                    # could be from non-unique with on_error=skip or from true gap
                    log.error(f'Found no file for page {page} in file group {ifg}')
                    if config.OCRD_MISSING_INPUT == 'abort':
                        raise MissingInputFile(ifg, page, mimetype)
            if not any(ifiles):
                # must be from non-unique with on_error=skip
                log.warning(f'Found no files for {page} - skipping')
                continue
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
926
927
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
    """Generate a string describing the full CLI of this processor including params.

    Args:
        ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
        processor_instance (object, optional): the processor implementation
            (for adding any module/class/function docstrings)
        subcommand (string, optional): 'worker' or 'server'

    Returns:
        The help text as a string, or ``None`` if ``subcommand`` is given
        but is neither 'worker' nor 'server'.
    """
    doc_help = ''
    if processor_instance:
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
        # Try to find the most concrete docstring among the various methods that an implementation
        # could overload, first serving.
        # In doing so, compare with Processor to avoid a glitch in the way py>=3.5 inherits docstrings.
        # (They are supposed to only repeat information inspect.getdoc, rather than inherit __doc__ itself.)
        for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']:
            instance_method = getattr(processor_instance, method)
            superclass_method = getattr(Processor, method)
            if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__:
                doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n'
                break
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)
    subcommands = '''\
    worker      Start a processing worker rather than do local processing
    server      Start a processor server rather than do local processing
'''

    processing_worker_options = '''\
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --log-filename                  Filename to redirect STDOUT/STDERR to,
                                  if specified.
'''

    processing_server_options = '''\
  --address                       The Processor server address in format
                                  "{host}:{port}"
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
'''

    processing_options = '''\
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those).
                                  Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
  --debug                         Abort on any errors with full stack trace.
                                  Short-hand for OCRD_MISSING_OUTPUT=ABORT
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
                                  If URL starts with http:// start an HTTP server there,
                                  otherwise URL is a path to an on-demand-created unix socket
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]
  --log-filename LOG-PATH         File to redirect stderr logging to (overriding ocrd_logging.conf).
'''

    information_options = '''\
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version
'''

    parameter_help = ''
    if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']:
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        for param_name, param in ocrd_tool['parameters'].items():
            # annotate each parameter with REQUIRED, or else with its default (if any)
            parameter_help += wrap('"%s" [%s%s]' % (
                param_name,
                param['type'],
                ' - REQUIRED' if param.get('required') else
                ' - %s' % json.dumps(param['default']) if 'default' in param else ''))
            parameter_help += '\n ' + wrap(param['description'])
            if 'enum' in param:
                parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
            parameter_help += "\n"

    if not subcommand:
        return f'''\
Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]

  {ocrd_tool['description']}{doc_help}

Subcommands:
{subcommands}
Options for processing:
{processing_options}
Options for information:
{information_options}
Parameters:
{parameter_help}
'''
    elif subcommand == 'worker':
        return f'''\
Usage: {ocrd_tool['executable']} worker [OPTIONS]

  Run {ocrd_tool['executable']} as a processing worker.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_worker_options}
'''
    elif subcommand == 'server':
        return f'''\
Usage: {ocrd_tool['executable']} server [OPTIONS]

  Run {ocrd_tool['executable']} as a processor server.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_server_options}
'''
    else:
        # unknown subcommand: there is no help text to offer
        return None
1074