Passed
Pull Request — master (#1240)
by Konstantin
03:00
created

ocrd.processor.base.Processor.verify()   B

Complexity

Conditions 6

Size

Total Lines 28
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 23
dl 0
loc 28
rs 8.3946
c 0
b 0
f 0
cc 6
nop 1
1
"""
2
Processor base class and helper functions.
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from functools import cached_property
13
from os.path import exists, join
14
from shutil import copyfileobj
15
import json
16
import os
17
from os import getcwd
18
from pathlib import Path
19
from typing import List, Optional, Union, get_args
20
import sys
21
import inspect
22
import tarfile
23
import io
24
import weakref
25
from frozendict import frozendict
26
from concurrent.futures import ThreadPoolExecutor, TimeoutError
27
28
from click import wrap_text
29
from deprecated import deprecated
30
from requests import HTTPError
31
32
from ..workspace import Workspace
33
from ..mets_server import ClientSideOcrdMets
34
from ocrd_models.ocrd_file import OcrdFileType
35
from .ocrd_page_result import OcrdPageResult
36
from ocrd_utils import (
37
    VERSION as OCRD_VERSION,
38
    MIMETYPE_PAGE,
39
    MIME_TO_EXT,
40
    config,
41
    getLogger,
42
    list_resource_candidates,
43
    pushd_popd,
44
    list_all_resources,
45
    get_processor_resource_types,
46
    resource_filename,
47
    parse_json_file_with_comments,
48
    make_file_id,
49
    deprecation_warning
50
)
51
from ocrd_validators import ParameterValidator
52
from ocrd_models.ocrd_page import (
53
    PageType,
54
    AlternativeImageType,
55
    MetadataItemType,
56
    LabelType,
57
    LabelsType,
58
    OcrdPage,
59
    to_xml,
60
)
61
from ocrd_modelfactory import page_from_file
62
from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
63
64
# XXX imports must remain for backwards-compatibility
65
from .helpers import run_cli, run_processor # pylint: disable=unused-import
66
67
68
class ResourceNotFoundError(FileNotFoundError):
    """
    Raised when a requested processor resource cannot be resolved.

    The offending resource ``name`` and the ``executable`` it was requested
    for are kept as attributes; ``message`` holds the full human-readable
    text, which is also passed on to :py:class:`FileNotFoundError`.
    """
    def __init__(self, name, executable):
        problem = f"Could not find resource '{name}' for executable '{executable}'. "
        remedy = f"Try 'ocrd resmgr download {executable} {name}' to download this resource."
        self.name, self.executable = name, executable
        self.message = problem + remedy
        super().__init__(self.message)
79
80
class NonUniqueInputFile(ValueError):
    """
    Raised when the fileGrp / pageId / mimetype selector is ambiguous:
    it yields multiple PAGE files, or no PAGE files but multiple images,
    or multiple files of the requested mimetype.

    The selector components are kept as attributes ``fileGrp``, ``pageId``
    and ``mimetype``; ``message`` holds the formatted description.
    """
    def __init__(self, fileGrp, pageId, mimetype):
        # a falsy mimetype means the default PAGE-or-image lookup was used
        shown_mimetype = mimetype or 'PAGE+image(s)'
        self.fileGrp, self.pageId, self.mimetype = fileGrp, pageId, mimetype
        self.message = (f"Could not determine unique input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {shown_mimetype}")
        super().__init__(self.message)
93
94
class MissingInputFile(ValueError):
    """
    Raised when the fileGrp / pageId / mimetype selector matches nothing:
    no PAGE files, or no PAGE and no image files, or no files of the
    requested mimetype.

    The selector components are kept as attributes ``fileGrp``, ``pageId``
    and ``mimetype``; ``message`` holds the formatted description.
    """
    def __init__(self, fileGrp, pageId, mimetype):
        # a falsy mimetype means the default PAGE-or-image lookup was used
        shown_mimetype = mimetype or 'PAGE+image(s)'
        self.fileGrp, self.pageId, self.mimetype = fileGrp, pageId, mimetype
        self.message = (f"Could not find input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {shown_mimetype}")
        super().__init__(self.message)
107
108
class Processor():
109
    """
110
    A processor is a tool that implements the uniform OCR-D
111
    `command-line interface for run-time data processing <https://ocr-d.de/en/spec/cli>`_.
112
113
    That is, it executes a single workflow step, or a combination of workflow steps,
114
    on the workspace (represented by local METS). It reads input files for all or selected
115
    physical pages of the input fileGrp(s), computes additional annotation, and writes output
116
    files for them into the output fileGrp(s). It may take a number of optional or mandatory
117
    parameters.
118
    """
119
120
    max_instances : int = -1
121
    """
122
    maximum number of cached instances (ignored if negative), to be applied on top of
123
    :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
124
125
    (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.)
126
    """
127
128
    max_workers : int = -1
129
    """
130
    maximum number of processor threads for page-parallel processing (ignored if negative),
131
    to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
132
    whatever is smaller).
133
134
    (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
135
    - at once, or if your class is not thread-safe.)
136
    """
137
138
    max_page_seconds : int = -1
139
    """
140
    maximum number of seconds that may be spent processing a single page (ignored if negative),
141
    to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT`
142
    (i.e. whatever is smaller).
143
144
    (Override this if you know how costly this processor may be, irrespective of image size
145
    or complexity of the page.)
146
    """
147
148
    @property
    def metadata_filename(self) -> str:
        """
        Name of the ``ocrd-tool.json`` file, relative to the package.

        Consumed by :py:data:`metadata_location`.

        (Override if ``ocrd-tool.json`` does not sit in the root of the module,
        e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``).
        """
        filename = 'ocrd-tool.json'
        return filename
159
160
    @cached_property
    def metadata_location(self) -> Path:
        """
        Location of the ``ocrd-tool.json`` file as distributed with the package,
        looked up via ``resource_filename`` in the top-level package of this
        class's module.

        Consumed by :py:data:`metadata_rawdict`.

        (Override if ``ocrd-tool.json`` is not distributed with the Python package.)
        """
        # XXX HACK: a leading 'src' component is not the real package name,
        # so the component following it is used instead
        package, *remainder = self.__module__.split('.')
        if package == 'src':
            package = remainder[0]
        return resource_filename(package, self.metadata_filename)
174
175
    @cached_property
    def metadata_rawdict(self) -> dict:
        """
        The ``ocrd-tool.json`` contents exactly as parsed from
        :py:data:`metadata_location` (no validation is done here).

        Consumed by :py:data:`metadata`.

        (Override if ``ocrd-tool.json`` is not in a file.)
        """
        location = self.metadata_location
        return parse_json_file_with_comments(location)
185
186
    @cached_property
    def metadata(self) -> dict:
        """
        The ``ocrd-tool.json`` dict contents of the package, according to the OCR-D
        `spec <https://ocr-d.de/en/spec/ocrd_tool>`_ for processor tools.

        The dict from :py:data:`metadata_rawdict` is passed through
        ``OcrdToolValidator.validate``. A failed validation is only logged as an
        error (with the validation report); the dict is returned either way.

        Consumed by :py:data:`ocrd_tool` and :py:data:`version`.

        (Override if you want to provide metadata programmatically instead of a
        JSON file.)
        """
        rawdict = self.metadata_rawdict
        report = OcrdToolValidator.validate(rawdict)
        if report.is_valid:
            return rawdict
        # a report without errors is merely "problematic" rather than "invalid"
        severity = 'problematic' if not report.errors else 'invalid'
        details = report.to_xml()
        contact = rawdict.get('git_url', 'the website')
        self.logger.error(f"The ocrd-tool.json of this processor is {severity}:\n"
                          f"{details}.\nPlease open an issue at {contact}.")
        return rawdict
207
208
    @cached_property
    def version(self) -> str:
        """
        The program version of the package, i.e. the ``version`` entry
        of :py:data:`metadata`.

        (Override if you do not want to use :py:data:`metadata` lookup
        mechanism.)
        """
        package_metadata = self.metadata
        return package_metadata['version']
218
219
    @cached_property
    def executable(self) -> str:
        """
        The executable name of this processor tool: the base name of the
        file belonging to the outermost frame of the current call stack.

        Consumed by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`.

        (Override if your entry-point name deviates from the ``executable``
        name, or the processor gets instantiated from another runtime.)
        """
        outermost_frame = inspect.stack()[-1]
        return os.path.basename(outermost_frame.filename)
231
232
    @cached_property
    def ocrd_tool(self) -> dict:
        """
        The ``ocrd-tool.json`` section describing this processor tool, i.e.
        the entry for :py:data:`executable` within the ``tools`` part
        of :py:data:`metadata`.

        (Override if you do not want to use :py:data:`metadata` lookup
        mechanism.)
        """
        tools = self.metadata['tools']
        return tools[self.executable]
243
244
    @property
    def parameter(self) -> Optional[dict]:
        """the runtime parameter dict to be used by this processor (``None`` until assigned)"""
        return getattr(self, '_parameter', None)
250
251
    @parameter.setter
    def parameter(self, parameter : dict) -> None:
        # replacing an existing parameter set: bring the processor down first
        already_set = self.parameter is not None
        if already_set:
            self.shutdown()
        report = ParameterValidator(self.ocrd_tool).validate(parameter)
        if not report.is_valid:
            raise ValueError(f'Invalid parameters:\n{report.to_xml()}')
        # store an immutable copy, so processors cannot modify parameters at runtime
        self._parameter = frozendict(parameter)
        # (re-)run setup, so models etc. get loaded for the new parameters
        self.setup()
263
264
    def __init__(
            self,
            # FIXME: remove in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=config.OCRD_DOWNLOAD_INPUT,
            version=None
    ):
        """
        Instantiate the processor.

        Every argument except ``parameter`` and ``download_files`` is deprecated:
        passing a value other than ``None`` emits a deprecation warning and merely
        stores the value on the instance. If ``parameter`` is given, it is assigned
        via the :py:data:`.parameter` property (which validates it and runs
        :py:meth:`setup`).

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process.
                 If not ``None``, then `chdir` to that directory (remembering the
                 previous working directory as ``old_pwd``).
                 Deprecated since version 3.0: Should be ``None`` here, but then
                 needs to be set before processing.
        Keyword Args:
             ocrd_tool (dict): ocrd-tool description of this processor; overrides
                 the :py:data:`ocrd_tool` and :py:data:`executable` properties.
                 Deprecated: use or override the properties instead.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``.
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input.
                 Deprecated since version 3.0: Should be ``None`` here, but then
                 needs to be set before processing.
             output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output.
                 Deprecated since version 3.0: Should be ``None`` here, but then
                 needs to be set before processing.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process
                 (or empty for all pages).
                 Deprecated since version 3.0: Should be ``None`` here, but then
                 needs to be set before processing.
             download_files (boolean): Whether input files will be downloaded prior to
                 processing, defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT`
             version (string): program version; overrides the :py:data:`version` property.
                 Deprecated: use or override the property instead.
        """
        # -- deprecated overrides of the metadata-derived properties
        if ocrd_tool is not None:
            deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                "use or override metadata/executable/ocrd-tool properties instead")
            self.ocrd_tool = ocrd_tool
            self.executable = ocrd_tool['executable']
        if version is not None:
            deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
                                "use or override metadata/version properties instead")
            self.version = version

        # -- deprecated processing context (belongs to process_workspace now)
        if workspace is not None:
            deprecation_warning("Passing a workspace argument other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.workspace = workspace
            # remember where we came from before entering the workspace directory
            self.old_pwd = getcwd()
            os.chdir(workspace.directory)
        if input_file_grp is not None:
            deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.input_file_grp = input_file_grp
        if output_file_grp is not None:
            deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.output_file_grp = output_file_grp
        if page_id is not None:
            deprecation_warning("Passing a page_id kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            # normalise an empty selection to None
            self.page_id = page_id or None

        self.download = download_files
        #: The logger to be used by processor implementations.
        # `ocrd.processor.base` internals should use :py:attr:`self._base_logger`
        implementation_logger_name = f'ocrd.processor.{self.__class__.__name__}'
        self.logger = getLogger(implementation_logger_name)
        self._base_logger = getLogger('ocrd.processor.base')

        # loggers must exist by now: the parameter setter validates and runs setup()
        if parameter is not None:
            self.parameter = parameter

        # ensure that shutdown gets called at destruction
        # NOTE(review): weakref.finalize is handed a bound method here, which holds a
        # reference to self - per the weakref documentation this keeps the instance
        # alive until interpreter exit; confirm whether that is intended
        self._finalizer = weakref.finalize(self, self.shutdown)

        # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
        mark_deprecated = deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')
        self.process = mark_deprecated(self.process)
340
341
    def show_help(self, subcommand=None):
        """
        Print the usage description generated by ``generate_processor_help``
        for this processor's :py:data:`ocrd_tool` (optionally for the given
        ``subcommand``) on stdout.
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self, subcommand=subcommand)
        print(help_text)
347
348
    def show_version(self):
        """
        Print this processor's :py:data:`version` along with the OCR-D core version on stdout.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
353
354
    def verify(self):
        """
        Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.

        Checks, in this order:

        - both attributes are set (not ``None``),
        - the number of comma-separated input / output fileGrps matches
          ``input_file_grp_cardinality`` / ``output_file_grp_cardinality``
          of :py:data:`ocrd_tool` (either an exact count, or a ``[minimum, maximum]``
          pair; non-positive numbers mean "no restriction"),
        - every input fileGrp exists in the METS of :py:attr:`workspace`.

        Returns:
            ``True`` (kept for backwards compatibility)

        Raises:
            AssertionError: if any of the checks fails
        """
        assert self.input_file_grp is not None, "input_file_grp must be set before processing"
        assert self.output_file_grp is not None, "output_file_grp must be set before processing"
        input_file_grps = self.input_file_grp.split(',')
        output_file_grps = self.output_file_grp.split(',')

        def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
            if isinstance(spec, int):
                # exact count (non-positive: unrestricted)
                if spec > 0:
                    assert len(grps) == spec, msg % (len(grps), str(spec))
            else:
                assert isinstance(spec, list)
                # [minimum, maximum] (each non-positive bound: unrestricted)
                minimum = spec[0]
                maximum = spec[1]
                if minimum > 0:
                    assert len(grps) >= minimum, msg % (len(grps), str(spec))
                if maximum > 0:
                    assert len(grps) <= maximum, msg % (len(grps), str(spec))

        assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
                                    "Unexpected number of input file groups %d vs %s")
        assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
                                    "Unexpected number of output file groups %d vs %s")
        # look up the existing fileGrps only once instead of once per input fileGrp
        existing_file_grps = self.workspace.mets.file_groups
        for input_file_grp in input_file_grps:
            assert input_file_grp in existing_file_grps, \
                f"input fileGrp {input_file_grp} does not exist in the workspace METS"
        # keep this for backwards compatibility:
        return True
382
383
    def dump_json(self):
        """
        Serialize :py:attr:`ocrd_tool` as indented JSON and print it on stdout.
        """
        serialized = json.dumps(self.ocrd_tool, indent=True)
        print(serialized)
388
389
    def dump_module_dir(self):
        """
        Write the value of :py:attr:`moduledir` to stdout.
        """
        module_directory = self.moduledir
        print(module_directory)
394
395
    def list_resources(self):
        """
        Print every entry yielded by :py:meth:`list_all_resources`
        (the installed resource files found in the search paths), one per line.
        """
        resources = self.list_all_resources()
        for resource in resources:
            print(resource)
401
402
    def setup(self) -> None:
403
        """
404
        Prepare the processor for actual data processing,
405
        prior to changing to the workspace directory but
406
        after parsing parameters.
407
408
        (Override this to load models into memory etc.)
409
        """
410
        pass
411
412
    def shutdown(self) -> None:
        """
        Hook for bringing down the processor after data processing.
        Runs after changing back from the workspace directory, but
        before exiting (or before setting up with different parameters).

        Does nothing by default.
        (Override this to unload models from memory etc.)
        """
        return None
421
422
    @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
    def process(self) -> None:
        """
        Legacy entry point: process all files of the :py:data:`workspace`
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        Not implemented in the base class - processors still relying on
        this deprecated API must override it.
        """
        raise NotImplementedError
435
436
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        (This will iterate over pages and files, calling
        :py:meth:`.process_page_file` and handling exceptions.
        It should be overridden by subclasses to handle cases
        like post-processing or computation across pages.)

        All work is done inside ``workspace.directory``. After :py:meth:`verify`,
        one :py:meth:`.process_page_file` task per page is submitted to a thread
        pool, then the results are collected in submission order:

        - the number of threads is ``config.OCRD_MAX_PARALLEL_PAGES``, capped by
          :py:data:`max_workers` if positive (more than one thread requires the
          workspace METS to be a METS server client),
        - the per-page timeout is ``config.OCRD_PROCESSING_PAGE_TIMEOUT``, capped by
          :py:data:`max_page_seconds` if positive (zero means no timeout),
        - :py:class:`FileExistsError` is handled according to
          ``config.OCRD_EXISTING_OUTPUT``,
        - any other failure (including timeout) is handled according to
          ``config.OCRD_MISSING_OUTPUT`` (``ABORT``, ``SKIP`` or ``COPY``),
        - :py:class:`NotImplementedError` makes the whole call fall back to the
          deprecated :py:meth:`process`.

        Raises:
            AssertionError: from :py:meth:`verify`, or if page-parallel processing
                is requested without a METS server
            ValueError: for an unknown value of ``config.OCRD_MISSING_OUTPUT``
            Exception: if too many pages were skipped or copied
                (see ``config.OCRD_MAX_MISSING_OUTPUTS``), or re-raised page failures
        """
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            self.verify()
            try:
                nr_succeeded = 0
                nr_skipped = 0
                nr_copied = 0

                # set up multithreading
                if self.max_workers <= 0:
                    max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
                else:
                    max_workers = max(0, min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers))
                if max_workers > 1:
                    assert isinstance(workspace.mets, ClientSideOcrdMets), \
                        "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url"
                if self.max_page_seconds <= 0:
                    max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT)
                else:
                    max_seconds = max(0, min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds))
                executor = ThreadPoolExecutor(
                    # zero configured workers still means one (sequential) worker thread
                    max_workers=max_workers or 1,
                    thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
                )
                self._base_logger.debug("started executor %s", str(executor))
                # maps each submitted future to its (page_id, input_files)
                tasks = {}

                for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
                    input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
                    # the page ID is taken from the first fileGrp that actually has a file
                    page_id = next(input_file.pageId
                                   for input_file in input_file_tuple
                                   if input_file)
                    self._base_logger.info(f"processing page {page_id}")
                    for i, input_file in enumerate(input_file_tuple):
                        if input_file is None:
                            # file/page not found in this file grp
                            continue
                        input_files[i] = input_file
                        if not self.download:
                            continue
                        try:
                            input_files[i] = self.workspace.download_file(input_file)
                        except (ValueError, FileNotFoundError, HTTPError) as e:
                            # keep the undownloaded file entry; the page task decides what to do
                            self._base_logger.error(repr(e))
                            self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
                    # process page
                    tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
                self._base_logger.debug("submitted %d processing tasks", len(tasks))

                for task in tasks:
                    # wait for results, handle errors
                    page_id, input_files = tasks[task]
                    # FIXME: differentiate error cases in various ways:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    try:
                        self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
                        # a timeout of zero means: wait indefinitely
                        task.result(timeout=max_seconds or None)
                        nr_succeeded += 1
                    # exclude NotImplementedError, so we can try process() below
                    except NotImplementedError:
                        raise
                    # handle input failures separately
                    except FileExistsError as err:
                        if config.OCRD_EXISTING_OUTPUT == 'ABORT':
                            raise err
                        if config.OCRD_EXISTING_OUTPUT == 'SKIP':
                            continue
                        if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
                            # too late here, must not happen
                            raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
                    # broad coverage of output failures (including TimeoutError)
                    except (Exception, TimeoutError) as err:
                        # FIXME: add re-usable/actionable logging
                        self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
                        if config.OCRD_MISSING_OUTPUT == 'ABORT':
                            raise err
                        if config.OCRD_MISSING_OUTPUT == 'SKIP':
                            nr_skipped += 1
                            continue
                        if config.OCRD_MISSING_OUTPUT == 'COPY':
                            # fallback: pass through the first input file as output
                            self._copy_page_file(input_files[0])
                            nr_copied += 1
                        else:
                            desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
                            raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")

                if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
                    raise Exception(f"too many failures with skipped output ({nr_skipped})")
                if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
                    # report the number of copied (not skipped) pages here
                    raise Exception(f"too many failures with fallback output ({nr_copied})")
                executor.shutdown()

            except NotImplementedError:
                # fall back to deprecated method
                self.process()
546
547
    def _copy_page_file(self, input_file : OcrdFileType) -> None:
        """
        Pass the given ``input_file`` of the :py:data:`workspace` through
        to the output fileGrp, as if it was a processing result.

        The file is parsed as PAGE, gets a new ``pcGtsId`` (the output file ID)
        and a metadata entry (see :py:meth:`add_metadata`), and is then added to
        :py:data:`output_file_grp` of the workspace. If parsing raises
        :py:class:`ValueError`, an error is logged and nothing is added.
        """
        assert isinstance(input_file, get_args(OcrdFileType))
        page_id = input_file.pageId
        self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
        pcgts : OcrdPage
        try:
            pcgts = page_from_file(input_file)
        except ValueError as err:
            # not PAGE and not an image to generate PAGE for
            self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
            return
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.add_metadata(pcgts)
        # existing output may only be replaced if explicitly configured
        overwrite = config.OCRD_EXISTING_OUTPUT == 'OVERWRITE'
        self.workspace.add_file(
            file_id=file_id,
            file_grp=self.output_file_grp,
            page_id=input_file.pageId,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
            force=overwrite,
        )
575
576
    def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
        """
        Process the given ``input_files`` of the :py:data:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`.OcrdFile` per input fileGrp)
        under the given :py:data:`.parameter`, and make sure the
        results get added accordingly.

        Entries of ``input_files`` may be ``None`` if the page has no file in the
        respective input fileGrp; these are passed on as ``None`` to
        :py:meth:`.process_page_pcgts`. The first entry must always be a file,
        because the page ID and the output file ID are derived from it.
        Entries which cannot be parsed as PAGE are logged and passed on
        as ``None`` as well.

        All derived images of the result are saved into :py:data:`output_file_grp`
        (and referenced from the result PAGE), then the result PAGE itself is
        added to :py:data:`output_file_grp`.

        (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses
        to handle cases like multiple output fileGrps, non-PAGE input etc.)

        Raises:
            AssertionError: if the first input file is missing
            ValueError: if a result image has an unknown ``alternative_image`` type
        """
        input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
        assert isinstance(input_files[0], get_args(OcrdFileType))
        page_id = input_files[0].pageId
        for i, input_file in enumerate(input_files):
            if input_file is None:
                # page has no file in this input fileGrp: keep the None placeholder
                self._base_logger.debug(f"ignoring missing input file at position {i} for page {page_id}")
                continue
            assert isinstance(input_file, get_args(OcrdFileType))
            self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
            try:
                page_ = page_from_file(input_file)
                assert isinstance(page_, OcrdPage)
                input_pcgts[i] = page_
            except ValueError as err:
                # not PAGE and not an image to generate PAGE for
                self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
        output_file_id = make_file_id(input_files[0], self.output_file_grp)
        result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
        for image_result in result.images:
            image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
            image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
            if isinstance(image_result.alternative_image, PageType):
                # special case: not an alternative image, but replacing the original image
                # (this is needed by certain processors when the original's coordinate system
                #  cannot or must not be kept)
                image_result.alternative_image.set_imageFilename(image_file_path)
                image_result.alternative_image.set_imageWidth(image_result.pil.width)
                image_result.alternative_image.set_imageHeight(image_result.pil.height)
            elif isinstance(image_result.alternative_image, AlternativeImageType):
                image_result.alternative_image.set_filename(image_file_path)
            else:
                raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
                                 f"{type(image_result.alternative_image)}")
            self.workspace.save_image_file(
                image_result.pil,
                image_file_id,
                self.output_file_grp,
                page_id=page_id,
                file_path=image_file_path,
                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
            )
        result.pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(result.pcgts)
        self.workspace.add_file(
            file_id=output_file_id,
            file_grp=self.output_file_grp,
            page_id=page_id,
            local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(result.pcgts),
            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
        )
636
637
    def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
        """
        Process the given ``input_pcgts`` of the :py:data:`.workspace`,
        representing one physical page (passed as one parsed
        :py:class:`.OcrdPage` per input fileGrp)
        under the given :py:data:`.parameter`, and return the
        resulting :py:class:`.OcrdPageResult`.

        Derived images can be returned along with the page by adding
        :py:class:`.OcrdPageResultImage` instances to the ``images`` attribute
        of the result. Each needs ``pil`` (:py:class:`PIL.Image` image data),
        ``file_id_suffix`` (used for generating IDs of the saved image) and
        ``alternative_image`` (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType`
        for setting the filename of the saved image).

        Not implemented in the base class.
        (This contains the main functionality and must be overridden by subclasses,
        unless it does not get called by some overridden :py:meth:`.process_page_file`.)
        """
        raise NotImplementedError
656
657
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Append a ``processingStep`` :py:class:`~ocrd_models.ocrd_page.MetadataItemType`
        ``MetadataItem`` to the PAGE-XML metadata of :py:class:`.OcrdPage` ``pcgts``.

        The item is named after the first entry of the tool's ``steps``, carries
        the tool's ``executable`` as value, and is labelled with the runtime
        parameters as well as the versions of the processor and of ocrd/core.
        """
        metadata_obj = pcgts.get_Metadata()
        assert metadata_obj is not None
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        # one label per runtime parameter
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=val)
                   for key, val in self.parameter.items()])
        # versions of this processor and of the core library
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=executable, value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        metadata_obj.add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             name=step_name,
                             value=executable,
                             Labels=[parameter_labels, version_labels]))
682
683
    def resolve_resource(self, val):
        """
        Look up the file path for a resource name, following the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_

        A ``val`` that already exists as a path is returned unchanged. Otherwise
        the first existing candidate location for this executable is returned.

        Args:
            val (string): resource value to resolve
        Raises:
            ResourceNotFoundError: if no candidate location exists
        """
        executable = self.ocrd_tool['executable']
        if exists(val):
            self._base_logger.debug("Resolved to absolute path %s", val)
            return val
        # FIXME: remove once workspace arg / old_pwd is gone:
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        for candidate in list_resource_candidates(executable, val,
                                                  cwd=cwd, moduled=self.moduledir):
            if exists(candidate):
                self._base_logger.debug("Resolved %s to absolute path %s", val, candidate)
                return candidate
        raise ResourceNotFoundError(val, executable)
707
708
    def show_resource(self, val):
        """
        Dump a resource to stdout, after resolving its name to a file path
        with the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_.

        A regular file is written verbatim; a directory is written as a
        gzip-compressed tarball of its contents.

        Args:
            val (string): resource value to show
        """
        location = Path(self.resolve_resource(val))
        if not location.is_dir():
            sys.stdout.buffer.write(location.read_bytes())
            return
        # archive relative to the directory itself, so member names start at '.'
        with pushd_popd(location):
            archive = io.BytesIO()
            with tarfile.open(fileobj=archive, mode='w:gz') as tarball:
                tarball.add('.')
            archive.seek(0)
            copyfileobj(archive, sys.stdout.buffer)
728
729
    def list_all_resources(self):
        """
        Yield (as :py:class:`pathlib.Path`) all resources found in the filesystem
        whose content-type, judged by filename suffix, matches this processor.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        accept_anything = '*/*' in mimetypes
        for found in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            found = Path(found)
            if not accept_anything:
                if found.is_dir() and 'text/directory' not in mimetypes:
                    continue
                # a MIME type without known extension matches any suffix,
                # otherwise the suffix must equal that extension
                if found.is_file() and all(found.suffix != MIME_TO_EXT.get(mime, found.suffix)
                                           for mime in mimetypes):
                    continue
            yield found
744
745
    @property
746
    def module(self):
747
        """
748
        The top-level module this processor belongs to.
749
        """
750
        # find shortest prefix path that is not just a namespace package
751
        fqname = ''
752
        for name in self.__module__.split('.'):
753
            if fqname:
754
                fqname += '.'
755
            fqname += name
756
            if getattr(sys.modules[fqname], '__file__', None):
757
                return fqname
758
        # fall-back
759
        return self.__module__
760
761
    @property
    def moduledir(self):
        """
        Filesystem path of the directory of :py:attr:`module`.
        """
        top_level_module = self.module
        return resource_filename(top_level_module, '.')
767
768
    @property
    def input_files(self):
        """
        The input files, one per page (for single-valued :py:attr:`input_file_grp`).

        Selection per physical page:

        - a single PAGE-XML for the page wins (all other files for that page
          are disregarded),
        - failing that, a single image file is taken (again disregarding all
          other files for that page),
        - anything else is an error (only PAGE-XML warrants having multiple
          images for a single page).

        See `algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        file_tuples = self.zip_input_files(mimetype=None, on_error='abort')
        if not file_tuples:
            return []
        assert len(file_tuples[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        # unwrap the 1-tuples
        return [first for first, *_ in file_tuples]
794
795
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all); any further
          conflicting files for that page and fileGrp are ignored as well
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Without a MIME type filter, multiple matches for PAGE-XML will
        always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             on_error (string): How to handle multiple file matches per page.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        Raises:
            ValueError: if :py:attr:`input_file_grp` is empty
            NonUniqueInputFile: on multiple matches, as described above
            MissingInputFile: if a page has no file in some fileGrp and
                ``config.OCRD_MISSING_INPUT`` is ``abort``
            Exception: if ``on_error`` is not one of the known strategies
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = {}
        # (pageId, fileGrp index) slots voided by on_error='skip': without this
        # memory, a third duplicate would find the slot empty again and be
        # accepted as if it were the only match
        skipped = set()

        def resolve_conflict(ift, i, ifg, file_, conflict_mimetype):
            # settle slot ``i`` of ``ift`` between its current file and ``file_``
            # according to ``on_error``
            if on_error == 'skip':
                ift[i] = None
                skipped.add((file_.pageId, i))
            elif on_error == 'first':
                pass # keep first match
            elif on_error == 'last':
                ift[i] = file_
            elif on_error == 'abort':
                raise NonUniqueInputFile(ifg, file_.pageId, conflict_mimetype)
            else:
                raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype)
            for file_ in files_:
                if not file_.pageId:
                    # ignore document-global files
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if not ift[i]:
                    # A slot voided by on_error='skip' stays void. Only an
                    # unfiltered PAGE file may still claim it, because PAGE
                    # takes precedence over any other file of the page.
                    if ((file_.pageId, i) in skipped and
                        (mimetype or file_.mimetype != MIMETYPE_PAGE)):
                        self._base_logger.debug(f"ignoring file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                                f"- page was skipped due to an earlier conflict")
                        continue
                    self._base_logger.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}")
                    ift[i] = file_
                    continue
                self._base_logger.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}")
                # fileGrp has multiple files for this page ID
                if mimetype:
                    # filter was active, this must not happen
                    self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                              f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}")
                    resolve_conflict(ift, i, ifg, file_, mimetype)
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise NonUniqueInputFile(ifg, file_.pageId, None)
                    # otherwise keep PAGE match
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                              f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}")
                    resolve_conflict(ift, i, ifg, file_, None)
        # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range)
        if self.page_id and not pages:
            self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
                                       f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
        ifts = []
        for page, ifiles in pages.items():
            for ifg, ifile in zip(ifgs, ifiles):
                if not ifile:
                    # could be from non-unique with on_error=skip or from true gap
                    self._base_logger.error(f'Found no file for page {page} in file group {ifg}')
                    if config.OCRD_MISSING_INPUT == 'abort':
                        raise MissingInputFile(ifg, page, mimetype)
            if not any(ifiles):
                # must be from non-unique with on_error=skip
                self._base_logger.warning(f'Found no files for {page} - skipping')
                continue
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
916
917
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
    """Generate a string describing the full CLI of this processor including params.

    Args:
        ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
        processor_instance (object, optional): the processor implementation
            (for adding any module/class/function docstrings)
        subcommand (string, optional): ``'worker'`` or ``'server'`` to get the help
            text of that subcommand; if empty, the general help text

    Returns:
        The help text, or ``None`` if ``subcommand`` is not one of the above.
    """
    doc_help = ''
    if processor_instance:
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
        # Try to find the most concrete docstring among the various methods that an implementation
        # could overload, first serving.
        # In doing so, compare with Processor to avoid a glitch in the way py>=3.5 inherits docstrings.
        # (They are supposed to only repeat information inspect.getdoc, rather than inherit __doc__ itself.)
        for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']:
            instance_method = getattr(processor_instance, method)
            superclass_method = getattr(Processor, method)
            if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__:
                doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n'
                break
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)
    subcommands = '''\
    worker      Start a processing worker rather than do local processing
    server      Start a processor server rather than do local processing
'''

    processing_worker_options = '''\
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --log-filename                  Filename to redirect STDOUT/STDERR to,
                                  if specified.
'''

    processing_server_options = '''\
  --address                       The Processor server address in format
                                  "{host}:{port}"
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
'''

    processing_options = '''\
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those).
                                  Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
  --debug                         Abort on any errors with full stack trace.
                                  Short-hand for OCRD_MISSING_OUTPUT=ABORT
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
                                  If URL starts with http:// start an HTTP server there,
                                  otherwise URL is a path to an on-demand-created unix socket
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]
  --log-filename LOG-PATH         File to redirect stderr logging to (overriding ocrd_logging.conf).
'''

    information_options = '''\
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version
'''

    parameter_help = ''
    if not ocrd_tool.get('parameters'):
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        for param_name, param in ocrd_tool['parameters'].items():
            # heading: name, type, and either the REQUIRED marker or the default value
            parameter_help += wrap('"%s" [%s%s]' % (
                param_name,
                param['type'],
                ' - REQUIRED' if 'required' in param and param['required'] else
                ' - %s' % json.dumps(param['default']) if 'default' in param else ''))
            parameter_help += '\n ' + wrap(param['description'])
            if 'enum' in param:
                parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
            parameter_help += "\n"

    if not subcommand:
        return f'''\
Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]

  {ocrd_tool['description']}{doc_help}

Subcommands:
{subcommands}
Options for processing:
{processing_options}
Options for information:
{information_options}
Parameters:
{parameter_help}
'''
    elif subcommand == 'worker':
        return f'''\
Usage: {ocrd_tool['executable']} worker [OPTIONS]

  Run {ocrd_tool['executable']} as a processing worker.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_worker_options}
'''
    elif subcommand == 'server':
        return f'''\
Usage: {ocrd_tool['executable']} server [OPTIONS]

  Run {ocrd_tool['executable']} as a processor server.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_server_options}
'''
    else:
        # unknown subcommand: there is no help text to offer
        return None
1064