Passed
Pull Request — master (#1240)
by Konstantin
03:20
created

ocrd.processor.base.Processor.setup()   A

Complexity

Conditions 1

Size

Total Lines 9
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 9
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""
2
Processor base class and helper functions.
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from functools import cached_property
13
from os.path import exists, join
14
from shutil import copyfileobj
15
import json
16
import os
17
from os import getcwd
18
from pathlib import Path
19
from typing import List, Optional, Union, get_args
20
import sys
21
import inspect
22
import tarfile
23
import io
24
import weakref
25
from frozendict import frozendict
26
from concurrent.futures import ThreadPoolExecutor, TimeoutError
27
28
from click import wrap_text
29
from deprecated import deprecated
30
from requests import HTTPError
31
32
from ..workspace import Workspace
33
from ..mets_server import ClientSideOcrdMets
34
from ocrd_models.ocrd_file import OcrdFileType
35
from .ocrd_page_result import OcrdPageResult
36
from ocrd_utils import (
37
    VERSION as OCRD_VERSION,
38
    MIMETYPE_PAGE,
39
    MIME_TO_EXT,
40
    config,
41
    getLogger,
42
    list_resource_candidates,
43
    pushd_popd,
44
    list_all_resources,
45
    get_processor_resource_types,
46
    resource_filename,
47
    parse_json_file_with_comments,
48
    make_file_id,
49
    deprecation_warning
50
)
51
from ocrd_validators import ParameterValidator
52
from ocrd_models.ocrd_page import (
53
    PageType,
54
    AlternativeImageType,
55
    MetadataItemType,
56
    LabelType,
57
    LabelsType,
58
    OcrdPage,
59
    to_xml,
60
)
61
from ocrd_modelfactory import page_from_file
62
from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
63
64
# XXX imports must remain for backwards-compatibility
65
from .helpers import run_cli, run_processor # pylint: disable=unused-import
66
67
68
class ResourceNotFoundError(FileNotFoundError):
    """
    Raised when a resource requested by a processor cannot be resolved.

    The resource ``name``, the ``executable`` it was requested for and the
    formatted ``message`` are all kept as attributes of the exception.
    """
    def __init__(self, name, executable):
        # the first sentence states the problem, the second one the remedy
        problem = f"Could not find resource '{name}' for executable '{executable}'. "
        remedy = f"Try 'ocrd resmgr download {executable} {name}' to download this resource."
        self.name = name
        self.executable = executable
        self.message = problem + remedy
        super().__init__(self.message)
79
80
class NonUniqueInputFile(ValueError):
    """
    Raised when the fileGrp / pageId / mimetype selector is ambiguous,
    i.e. it matches several PAGE files, or no PAGE file but several images,
    or several files of the requested mimetype.
    """
    def __init__(self, fileGrp, pageId, mimetype):
        # a falsy mimetype is reported as the default PAGE-or-image lookup
        shown_mimetype = mimetype or 'PAGE+image(s)'
        self.fileGrp = fileGrp
        self.pageId = pageId
        self.mimetype = mimetype
        self.message = (f"Could not determine unique input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {shown_mimetype}")
        super().__init__(self.message)
93
94
class MissingInputFile(ValueError):
    """
    Raised when the fileGrp / pageId / mimetype selector matches nothing,
    i.e. no PAGE file, or neither PAGE nor image files,
    or no file of the requested mimetype.
    """
    def __init__(self, fileGrp, pageId, mimetype):
        # a falsy mimetype is reported as the default PAGE-or-image lookup
        shown_mimetype = mimetype or 'PAGE+image(s)'
        self.fileGrp = fileGrp
        self.pageId = pageId
        self.mimetype = mimetype
        self.message = (f"Could not find input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {shown_mimetype}")
        super().__init__(self.message)
107
108
class Processor():
109
    """
110
    A processor is a tool that implements the uniform OCR-D
111
    `command-line interface for run-time data processing <https://ocr-d.de/en/spec/cli>`_.
112
113
    That is, it executes a single workflow step, or a combination of workflow steps,
114
    on the workspace (represented by local METS). It reads input files for all or selected
115
    physical pages of the input fileGrp(s), computes additional annotation, and writes output
116
    files for them into the output fileGrp(s). It may take a number of optional or mandatory
117
    parameters.
118
    """
119
120
    max_instances : int = -1
121
    """
122
    maximum number of cached instances (ignored if negative), to be applied on top of
123
    :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
124
125
    (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.)
126
    """
127
128
    max_workers : int = -1
129
    """
130
    maximum number of processor threads for page-parallel processing (ignored if negative),
131
    to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
132
    whatever is smaller).
133
134
    (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
135
    - at once, or if your class is not thread-safe.)
136
    """
137
138
    max_page_seconds : int = -1
139
    """
140
    maximum number of seconds that may be spent processing a single page (ignored if negative),
141
    to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT`
142
    (i.e. whatever is smaller).
143
144
    (Override this if you know how costly this processor may be, irrespective of image size
145
    or complexity of the page.)
146
    """
147
148
    @property
149
    def metadata_filename(self) -> str:
        """
        Name of the ``ocrd-tool.json`` file, relative to the package it ships in.

        Consumed by :py:data:`metadata_location`.

        (Override this when ``ocrd-tool.json`` does not sit at the root of the
        module, e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``.)
        """
        relative_name = 'ocrd-tool.json'
        return relative_name
159
160
    @cached_property
161
    def metadata_location(self) -> Path:
        """
        Absolute path of the ``ocrd-tool.json`` file as distributed with the package.

        The package is found by walking the dotted package name of the module
        defining this processor from left to right, and picking the shortest
        prefix whose import spec has a file-system location. The file named by
        :py:data:`metadata_filename` is then resolved inside that package.

        Used by :py:data:`metadata_rawdict`.

        (Override if ``ocrd-tool.json`` is not distributed with the Python package.)

        Raises:
            Exception: if no prefix of the package name has a location.
        """
        module = inspect.getmodule(self)
        package = module.__package__
        tokens = package.split('.')
        # for namespace packages, we cannot just use the first token
        for depth in range(1, len(tokens) + 1):
            prefix = '.'.join(tokens[:depth])
            if sys.modules[prefix].__spec__.has_location:
                return resource_filename(prefix, self.metadata_filename)
        # interpolate explicitly: Exception() does not apply %-formatting to its arguments
        raise Exception(f"cannot find top-level module prefix for {package}")
177
178
    @cached_property
179
    def metadata_rawdict(self) -> dict:
        """
        The ``ocrd-tool.json`` contents of the package as parsed from the file
        at :py:data:`metadata_location` (no validation or expansion is done here).

        Used by :py:data:`metadata`.

        (Override if ``ocrd-tool.json`` is not in a file.)
        """
        json_path = self.metadata_location
        return parse_json_file_with_comments(json_path)
188
189
    @cached_property
190
    def metadata(self) -> dict:
        """
        The ``ocrd-tool.json`` dict contents of the package, according to the OCR-D
        `spec <https://ocr-d.de/en/spec/ocrd_tool>`_ for processor tools.

        The raw dict from :py:data:`metadata_rawdict` is run through the
        `schema <https://ocr-d.de/en/spec/ocrd_tool#definition>`_ validator.
        A failed validation is only logged as an error; the dict is returned
        in either case.

        Used by :py:data:`ocrd_tool` and :py:data:`version`.

        (Override if you want to provide metadata programmatically instead of a
        JSON file.)
        """
        tool_json = self.metadata_rawdict
        report = OcrdToolValidator.validate(tool_json)
        if report.is_valid:
            return tool_json
        # a report without errors is not valid either, but merely "problematic"
        self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n"
                          f"{report.to_xml()}.\nPlease open an issue at {tool_json.get('git_url', 'the website')}.")
        return tool_json
210
211
    @cached_property
212
    def version(self) -> str:
        """
        The program version of the package,
        read from the ``version`` entry of :py:data:`metadata`.

        (Override if you do not want to use :py:data:`metadata` lookup
        mechanism.)
        """
        package_metadata = self.metadata
        return package_metadata['version']
221
222
    @cached_property
223
    def executable(self) -> str:
        """
        The executable name of this processor tool: the base name of the
        file that the outermost frame of the current call stack belongs to.

        Used by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`.

        (Override if your entry-point name deviates from the ``executable``
        name, or the processor gets instantiated from another runtime.)
        """
        # the last entry of the stack is the outermost caller, i.e. the entry point
        outermost_frame = inspect.stack()[-1]
        return os.path.basename(outermost_frame.filename)
234
235
    @cached_property
236
    def ocrd_tool(self) -> dict:
        """
        The ``ocrd-tool.json`` dict contents of this processor tool:
        the entry stored under :py:data:`executable` in the ``tools``
        section of :py:data:`metadata`.

        (Override if you do not want to use :py:data:`metadata` lookup
        mechanism.)
        """
        all_tools = self.metadata['tools']
        return all_tools[self.executable]
246
247
    @property
248
    def parameter(self) -> Optional[dict]:
        """the runtime parameter dict to be used by this processor (``None`` as long as none has been set)"""
        return getattr(self, '_parameter', None)
253
254
    @parameter.setter
255
    def parameter(self, parameter : dict) -> None:
        """
        Validate ``parameter`` against :py:data:`ocrd_tool`, store it as a
        read-only dict and (re-)run :py:meth:`setup`.

        If a parameter dict had already been set, :py:meth:`shutdown` is
        called first.

        Raises:
            ValueError: if the validation report is not valid.
        """
        already_configured = self.parameter is not None
        if already_configured:
            self.shutdown()
        report = ParameterValidator(self.ocrd_tool).validate(parameter)
        if not report.is_valid:
            raise ValueError(f'Invalid parameters:\n{report.to_xml()}')
        # freeze the dict so it cannot be modified afterwards
        self._parameter = frozendict(parameter)
        # now the processor can load models etc for these parameters
        self.setup()
266
267
    def __init__(
            self,
            # FIXME: remove in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=config.OCRD_DOWNLOAD_INPUT,
            version=None
    ):
        """
        Create the processor instance.

        A ``parameter`` other than ``None`` is assigned to :py:data:`.parameter`
        right away, which validates it and runs :py:meth:`setup`. All other
        arguments except ``download_files`` are deprecated and trigger a
        deprecation warning when given.

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process.
                 If not ``None``, then `chdir` to that directory.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
        Keyword Args:
             ocrd_tool (dict): replacement for the :py:data:`ocrd_tool` lookup; its
                 ``executable`` entry also replaces :py:data:`executable`. Deprecated.
             parameter (dict): the runtime choices for ocrd-tool ``parameters``.
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
             output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process
                 (or empty for all pages).
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set
                 before processing.
             download_files (boolean): Whether input files will be downloaded prior to processing,
                 defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT`
             version (string): replacement for the :py:data:`version` lookup. Deprecated.
        """
        # -- deprecated overrides of the metadata lookup properties
        if ocrd_tool is not None:
            deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                "use or override metadata/executable/ocrd-tool properties instead")
            self.ocrd_tool = ocrd_tool
            self.executable = ocrd_tool['executable']
        if version is not None:
            deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
                                "use or override metadata/version properties instead")
            self.version = version

        # -- deprecated way of passing what process_workspace() needs
        if workspace is not None:
            deprecation_warning("Passing a workspace argument other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.workspace = workspace
            # remember where we came from before entering the workspace
            self.old_pwd = os.getcwd()
            os.chdir(self.workspace.directory)
        if input_file_grp is not None:
            deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.input_file_grp = input_file_grp
        if output_file_grp is not None:
            deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.output_file_grp = output_file_grp
        if page_id is not None:
            deprecation_warning("Passing a page_id kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            # an empty selection means "all pages", which is stored as None
            self.page_id = page_id or None
        self.download = download_files

        # -- loggers (must exist before the parameter gets validated below)
        class_name = self.__class__.__name__
        #: The logger to be used by processor implementations.
        # `ocrd.processor.base` internals should use :py:attr:`self._base_logger`
        self.logger = getLogger(f'ocrd.processor.{class_name}')
        self._base_logger = getLogger('ocrd.processor.base')

        if parameter is not None:
            self.parameter = parameter
        # ensure that shutdown gets called at destruction
        self._finalizer = weakref.finalize(self, self.shutdown)
        # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
        mark_deprecated = deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')
        self.process = mark_deprecated(self.process)
343
344
    def show_help(self, subcommand=None):
        """
        Print the usage text generated for this processor from its
        :py:data:`ocrd_tool` (optionally for the given ``subcommand``).
        """
        help_text = generate_processor_help(
            self.ocrd_tool,
            processor_instance=self,
            subcommand=subcommand,
        )
        print(help_text)
350
351
    def show_version(self):
        """
        Print this processor's :py:data:`version` along with the OCR-D core version.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
356
357
    def verify(self):
        """
        Check :py:attr:`input_file_grp` and :py:attr:`output_file_grp` against the processor's requirements:

        - both must be set,
        - the number of comma-separated groups in each must agree with the
          ``input_file_grp_cardinality`` / ``output_file_grp_cardinality``
          entries of :py:data:`ocrd_tool` (a single number, or a list
          starting with minimum and maximum; non-positive values are not enforced),
        - every input group must exist in the METS of the workspace.

        Returns:
            ``True`` (violations raise ``AssertionError`` instead).
        """
        assert self.input_file_grp is not None
        assert self.output_file_grp is not None
        in_grps = self.input_file_grp.split(',')
        out_grps = self.output_file_grp.split(',')

        def check_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
            count = len(grps)
            if isinstance(spec, int):
                # exact number required (unless non-positive)
                assert not (spec > 0 and count != spec), msg % (count, str(spec))
                return
            assert isinstance(spec, list)
            lower, upper = spec[0], spec[1]
            assert not (lower > 0 and count < lower), msg % (count, str(spec))
            assert not (upper > 0 and count > upper), msg % (count, str(spec))

        check_cardinality(in_grps, self.ocrd_tool['input_file_grp_cardinality'],
                          "Unexpected number of input file groups %d vs %s")
        check_cardinality(out_grps, self.ocrd_tool['output_file_grp_cardinality'],
                          "Unexpected number of output file groups %d vs %s")
        for grp in in_grps:
            assert grp in self.workspace.mets.file_groups
        # keep this for backwards compatibility:
        return True
385
386
    def dump_json(self):
        """
        Serialize :py:attr:`ocrd_tool` as indented JSON and print it on stdout.
        """
        serialized = json.dumps(self.ocrd_tool, indent=True)
        print(serialized)
391
392
    def dump_module_dir(self):
        """
        Write the value of :py:attr:`moduledir` to stdout.
        """
        module_dir = self.moduledir
        print(module_dir)
397
398
    def list_resources(self):
        """
        Print every entry yielded by :py:meth:`list_all_resources`, one per line.
        """
        for resource in self.list_all_resources():
            print(resource)
404
405
    def setup(self) -> None:
406
        """
407
        Prepare the processor for actual data processing,
408
        prior to changing to the workspace directory but
409
        after parsing parameters.
410
411
        (Override this to load models into memory etc.)
412
        """
413
        pass
414
415
    def shutdown(self) -> None:
        """
        Hook for bringing down the processor after data processing,
        after changing back from the workspace directory but
        before exiting (or setting up with different parameters).
        The base implementation does nothing.

        (Override this to unload models from memory etc.)
        """
        return None
424
425
    @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
426
    def process(self) -> None:
        """
        Legacy entry point: process all files of the :py:data:`workspace`
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        The base implementation always raises ``NotImplementedError``.

        (This contains the main functionality and needs to be
        overridden by subclasses.)
        """
        raise NotImplementedError()
438
439
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        (This will iterate over pages and files, calling
        :py:meth:`.process_page_file` and handling exceptions.
        It should be overridden by subclasses to handle cases
        like post-processing or computation across pages.)

        Every page becomes one task in a thread pool; the results are then
        awaited in submission order, each with the configured page timeout.
        Failures are handled according to ``OCRD_EXISTING_OUTPUT`` (for
        ``FileExistsError``) and ``OCRD_MISSING_OUTPUT`` (anything else,
        including timeouts): abort, skip the page, or copy the first input
        file as fallback output.

        If a page task raises ``NotImplementedError``, the deprecated
        :py:meth:`process` is called instead.

        Whenever the loop is left by an exception (including that fallback),
        tasks that have not started yet are cancelled and the thread pool
        is released without waiting.
        """
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            self.verify()
            try:
                nr_succeeded = 0
                nr_skipped = 0
                nr_copied = 0

                # set up multithreading
                max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
                if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
                    self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers)
                    max_workers = self.max_workers
                if max_workers > 1:
                    assert isinstance(workspace.mets, ClientSideOcrdMets), \
                        "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url"
                max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT)
                if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT:
                    self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
                    max_seconds = self.max_page_seconds
                executor = ThreadPoolExecutor(
                    max_workers=max_workers or 1,
                    thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
                )
                self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
                tasks = {}
                completed = False
                try:
                    for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
                        input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
                        page_id = next(input_file.pageId
                                       for input_file in input_file_tuple
                                       if input_file)
                        self._base_logger.info(f"preparing page {page_id}")
                        for i, input_file in enumerate(input_file_tuple):
                            if input_file is None:
                                # file/page not found in this file grp
                                continue
                            input_files[i] = input_file
                            if not self.download:
                                continue
                            try:
                                input_files[i] = self.workspace.download_file(input_file)
                            except (ValueError, FileNotFoundError, HTTPError) as e:
                                # keep the non-downloaded file entry and let the page task deal with it
                                self._base_logger.error(repr(e))
                                self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
                        # process page
                        tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
                    self._base_logger.debug("submitted %d processing tasks", len(tasks))

                    for task in tasks:
                        # wait for results, handle errors
                        page_id, input_files = tasks[task]
                        # FIXME: differentiate error cases in various ways:
                        # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                        # - transient (I/O or OOM) error → maybe sleep, retry
                        # - persistent (data) error → skip / dummy / raise
                        try:
                            self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
                            # a timeout of 0 means no limit at all
                            task.result(timeout=max_seconds or None)
                            nr_succeeded += 1
                        # exclude NotImplementedError, so we can try process() below
                        except NotImplementedError:
                            raise
                        # handle input failures separately
                        except FileExistsError as err:
                            if config.OCRD_EXISTING_OUTPUT == 'ABORT':
                                raise err
                            if config.OCRD_EXISTING_OUTPUT == 'SKIP':
                                continue
                            if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
                                # too late here, must not happen
                                raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
                        # broad coverage of output failures (including TimeoutError)
                        except (Exception, TimeoutError) as err:
                            # FIXME: add re-usable/actionable logging
                            if config.OCRD_MISSING_OUTPUT == 'ABORT':
                                self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
                                raise err
                            self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
                            if config.OCRD_MISSING_OUTPUT == 'SKIP':
                                nr_skipped += 1
                                continue
                            if config.OCRD_MISSING_OUTPUT == 'COPY':
                                self._copy_page_file(input_files[0])
                                nr_copied += 1
                            else:
                                desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
                                raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")

                    # NOTE(review): both checks compare the ratio of succeeded to failed pages
                    # with OCRD_MAX_MISSING_OUTPUTS - confirm this matches the meaning of that setting
                    if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
                        raise Exception(f"too many failures with skipped output ({nr_skipped})")
                    if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
                        raise Exception(f"too many failures with fallback output ({nr_copied})")
                    completed = True
                finally:
                    if not completed:
                        # aborting (or falling back to process()): pages still waiting in
                        # the queue must not be started anymore
                        for task in tasks:
                            task.cancel()
                    # regular end: wait for the workers; otherwise just release them
                    executor.shutdown(wait=completed)

            except NotImplementedError:
                # fall back to deprecated method
                self.process()
550
551
    def _copy_page_file(self, input_file : OcrdFileType) -> None:
        """
        Pass one input file of the :py:data:`workspace` through unchanged:
        parse ``input_file`` as PAGE, give it the ID of the output file,
        add this processor's metadata and store it in the output fileGrp
        as if it was a processing result.

        If parsing raises ``ValueError``, an error is logged and
        nothing gets added.
        """
        assert isinstance(input_file, get_args(OcrdFileType))
        self._base_logger.debug(f"parsing file {input_file.ID} for page {input_file.pageId}")
        try:
            pcgts : OcrdPage = page_from_file(input_file)
        except ValueError as err:
            # not PAGE and not an image to generate PAGE for
            self._base_logger.error(f"non-PAGE input for page {input_file.pageId}: {err}")
            return
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.add_metadata(pcgts)
        target_path = join(self.output_file_grp, file_id + '.xml')
        self.workspace.add_file(
            file_id=file_id,
            file_grp=self.output_file_grp,
            page_id=input_file.pageId,
            local_filename=target_path,
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
            # existing files may only be replaced in overwrite mode
            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
        )
579
580
    def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
        """
        Process the given ``input_files`` of the :py:data:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`.OcrdFile` per input fileGrp)
        under the given :py:data:`.parameter`, and make sure the
        results get added accordingly.

        An entry may be ``None`` if the page has no file in the respective
        input fileGrp; such entries are passed on as ``None`` to
        :py:meth:`.process_page_pcgts`. The same holds for files which
        cannot be parsed as PAGE (an error is logged for these). The first
        file that is present determines the page ID and the output file ID.

        All images of the result are saved into the output fileGrp and
        referenced from the result PAGE, which is then added itself.

        (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses
        to handle cases like multiple output fileGrps, non-PAGE input etc.)

        Raises:
            AssertionError: if no input file is present at all.
            ValueError: if a result image is neither attached to a page nor
                to an alternative image.
        """
        input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
        # the page may be missing in the first fileGrp, so use the first file we do have
        first_file = next((input_file for input_file in input_files
                           if input_file is not None), None)
        assert isinstance(first_file, get_args(OcrdFileType))
        page_id = first_file.pageId
        self._base_logger.info("processing page %s", page_id)
        for i, input_file in enumerate(input_files):
            if input_file is None:
                # page not available in this fileGrp: leave the slot empty
                self._base_logger.debug(f"no file in input fileGrp #{i} for page {page_id}")
                continue
            assert isinstance(input_file, get_args(OcrdFileType))
            self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
            try:
                page_ = page_from_file(input_file)
                assert isinstance(page_, OcrdPage)
                input_pcgts[i] = page_
            except ValueError as err:
                # not PAGE and not an image to generate PAGE for
                self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
        output_file_id = make_file_id(first_file, self.output_file_grp)
        result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
        for image_result in result.images:
            image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
            image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
            if isinstance(image_result.alternative_image, PageType):
                # special case: not an alternative image, but replacing the original image
                # (this is needed by certain processors when the original's coordinate system
                #  cannot or must not be kept)
                image_result.alternative_image.set_imageFilename(image_file_path)
                image_result.alternative_image.set_imageWidth(image_result.pil.width)
                image_result.alternative_image.set_imageHeight(image_result.pil.height)
            elif isinstance(image_result.alternative_image, AlternativeImageType):
                image_result.alternative_image.set_filename(image_file_path)
            else:
                raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
                                 f"{type(image_result.alternative_image)}")
            self.workspace.save_image_file(
                image_result.pil,
                image_file_id,
                self.output_file_grp,
                page_id=page_id,
                file_path=image_file_path,
                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
            )
        result.pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(result.pcgts)
        self.workspace.add_file(
            file_id=output_file_id,
            file_grp=self.output_file_grp,
            page_id=page_id,
            local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(result.pcgts),
            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
        )
641
642
    def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
        """
        Process a single physical page of the :py:data:`.workspace`.

        The page is passed as ``input_pcgts``, i.e. one parsed
        :py:class:`.OcrdPage` per input fileGrp. It is to be processed
        under the given :py:data:`.parameter`, and the outcome is to be
        returned as an :py:class:`.OcrdPageResult`.

        Derived images may be handed back as well: append instances of
        :py:class:`.OcrdPageResultImage` to the ``images`` attribute of the
        returned :py:class:`.OcrdPageResult`. Each of them has required fields
        for ``pil`` (:py:class:`PIL.Image` image data), ``file_id_suffix``
        (used for generating IDs of the saved image) and ``alternative_image``
        (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType`
        for setting the filename of the saved image).

        (This contains the main functionality and must be overridden by subclasses,
        unless it does not get called by some overridden :py:meth:`.process_page_file`.
        The base implementation only raises :py:class:`NotImplementedError`.)
        """
        raise NotImplementedError()
661
662
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Document this processing step in the PAGE-XML metadata of ``pcgts``.

        Appends one :py:class:`~ocrd_models.ocrd_page.MetadataItemType`
        ``MetadataItem`` of type ``processingStep`` to the ``Metadata`` element
        of the :py:class:`.OcrdPage`. The item is named after the first entry of
        the tool's ``steps``, valued with the tool's ``executable``, and carries
        two label groups: the runtime parameters and the versions of the
        processor and of ocrd/core.
        """
        metadata = pcgts.get_Metadata()
        assert metadata is not None
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        # one label per runtime parameter
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=setting)
                   for key, setting in self.parameter.items()])
        # versions of this processor and of the OCR-D core it runs on
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=self.ocrd_tool['executable'],
                             value=self.version),
                   LabelType(type_='ocrd/core',
                             value=OCRD_VERSION)])
        item = MetadataItemType(type_="processingStep",
                                name=step_name,
                                value=executable,
                                Labels=[parameter_labels, version_labels])
        metadata.add_MetadataItem(item)
687
688
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_

        A ``val`` that already exists as a path is returned unchanged.
        Otherwise the first existing candidate location for this
        processor's executable is returned.

        Args:
            val (string): resource value to resolve
        Returns:
            The path of the resolved resource.
        Raises:
            ResourceNotFoundError: if neither ``val`` itself nor any
                candidate location exists.
        """
        executable = self.ocrd_tool['executable']
        if exists(val):
            self._base_logger.debug("Resolved to absolute path %s" % val)
            return val
        # FIXME: remove once workspace arg / old_pwd is gone:
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=cwd, moduled=self.moduledir)
        existing = [path for path in candidates if exists(path)]
        if not existing:
            raise ResourceNotFoundError(val, executable)
        best = existing[0]
        self._base_logger.debug("Resolved %s to absolute path %s" % (val, best))
        return best
712
713
    def show_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_,
        then print its contents to stdout.

        A regular file is written to stdout verbatim (as bytes). A directory
        is packed into a gzip-compressed tarball (relative to the directory
        itself), which is then written to stdout.

        Args:
            val (string): resource value to show
        """
        target = Path(self.resolve_resource(val))
        if not target.is_dir():
            sys.stdout.buffer.write(target.read_bytes())
            return
        # change into the directory so that archive members are stored relative to it
        with pushd_popd(target):
            archive = io.BytesIO()
            with tarfile.open(fileobj=archive, mode='w:gz') as tarball:
                tarball.add('.')
            # rewind the in-memory archive before streaming it out
            archive.seek(0)
            copyfileobj(archive, sys.stdout.buffer)
733
734
    def list_all_resources(self):
        """
        Iterate over all resources found in the filesystem and matching content-type by filename suffix

        Yields :py:class:`pathlib.Path` objects. If the resource types of this
        processor include the wildcard ``*/*``, every resource is yielded.
        Otherwise directories are only yielded when ``text/directory`` is among
        the types, and files only when their suffix matches one of the types.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        accept_anything = '*/*' in mimetypes
        for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            res = Path(res)
            if accept_anything:
                yield res
                continue
            if res.is_dir() and 'text/directory' not in mimetypes:
                continue
            if res.is_file():
                suffix = res.suffix
                # if we do not know all MIME types, then keep the file, otherwise require suffix match
                # (an unknown MIME type falls back to the file's own suffix, which always matches)
                if not any(suffix == MIME_TO_EXT.get(mime, suffix)
                           for mime in mimetypes):
                    continue
            yield res
749
750
    @property
    def module(self):
        """
        The top-level module this processor belongs to.

        This is the shortest dotted prefix of the defining module's name
        whose entry in :py:data:`sys.modules` has a non-empty ``__file__``
        (i.e. which is not just a namespace package). If no prefix
        qualifies, the full module name is returned.
        """
        components = self.__module__.split('.')
        # find shortest prefix path that is not just a namespace package
        for depth in range(1, len(components) + 1):
            candidate = '.'.join(components[:depth])
            if getattr(sys.modules[candidate], '__file__', None):
                return candidate
        # fall-back
        return self.__module__
765
766
    @property
    def moduledir(self):
        """
        The filesystem path of the module directory.

        Obtained by looking up the resource ``.`` in :py:attr:`module`.
        """
        top_level = self.module
        return resource_filename(top_level, '.')
772
773
    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        For each physical page:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)

        See `algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

        The selection itself is delegated to :py:meth:`zip_input_files`
        (without MIME type filter and with ``on_error='abort'``); this
        property merely unpacks the resulting 1-tuples.

        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
        Raises:
            ValueError: if :py:attr:`input_file_grp` is not set.
            AssertionError: if more than one input fileGrp was configured.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        file_tuples = self.zip_input_files(mimetype=None, on_error='abort')
        if file_tuples:
            assert len(file_tuples[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
            return [file_tuple[0] for file_tuple in file_tuples]
        return []
799
800
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are logged as warnings and then
        handled according to :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Multiple matches for PAGE-XML will always raise an exception
        (when no MIME type filter is active).

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             on_error (string): How to handle multiple file matches per page.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        Raises:
            ValueError: if :py:attr:`input_file_grp` is not set.
            NonUniqueInputFile: on multiple PAGE files for one page, or on
                any multiple match if ``on_error`` is ``abort``.
            MissingInputFile: if some page has no file in some fileGrp and
                ``config.OCRD_MISSING_INPUT`` is ``abort``.
            Exception: if a multiple match occurs and ``on_error`` is none
                of the known strategies.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        def resolve_conflict(kept, other, ifg, reported_mimetype):
            # Decide which of two files competing for the same page and fileGrp
            # occupies the slot afterwards, following the ``on_error`` strategy.
            if on_error == 'skip':
                return None
            if on_error == 'first':
                return kept # keep first match
            if on_error == 'last':
                return other
            if on_error == 'abort':
                raise NonUniqueInputFile(ifg, other.pageId, reported_mimetype)
            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        # maps page ID to a list with one slot (file or None) per input fileGrp
        pages = {}
        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                            # sort by MIME type so PAGE comes before images
                            key=lambda file_: file_.mimetype)
            for file_ in files_:
                if not file_.pageId:
                    # ignore document-global files
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if not ift[i]:
                    # NOTE(review): a slot emptied by on_error='skip' also lands here,
                    # so a third match for the same page gets accepted like a first one
                    self._base_logger.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}")
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                self._base_logger.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}")
                if mimetype:
                    # filter was active, this must not happen
                    self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                              f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}")
                    ift[i] = resolve_conflict(ift[i], file_, ifg, mimetype)
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        # two PAGE files for the same page are never acceptable
                        raise NonUniqueInputFile(ifg, file_.pageId, None)
                    # otherwise keep PAGE match (PAGE takes precedence over any other type)
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                              f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}")
                    ift[i] = resolve_conflict(ift[i], file_, ifg, None)
        # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range)
        if self.page_id and not pages:
            self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
                                       f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
        ifts = []
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # could be from non-unique with on_error=skip or from true gap
                    self._base_logger.error(f'Found no file for page {page} in file group {ifg}')
                    if config.OCRD_MISSING_INPUT == 'abort':
                        raise MissingInputFile(ifg, page, mimetype)
            if not any(ifiles):
                # must be from non-unique with on_error=skip
                self._base_logger.warning(f'Found no files for {page} - skipping')
                continue
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
921
922
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
    """Generate a string describing the full CLI of this processor including params.

    Args:
        ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
        processor_instance (object, optional): the processor implementation
            (for adding any module/class/function docstrings)
        subcommand (string): 'worker' or 'server'

    Returns:
        The help text for local processing (if ``subcommand`` is empty), for
        the ``worker`` or for the ``server`` subcommand; ``None`` for any
        other ``subcommand``.
    """
    doc_help = ''
    if processor_instance:
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
        # Try to find the most concrete docstring among the various methods that an implementation
        # could overload, first serving.
        # In doing so, compare with Processor to avoid a glitch in the way py>=3.5 inherits docstrings.
        # (They are supposed to only repeat information inspect.getdoc, rather than inherit __doc__ itself.)
        for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']:
            instance_method = getattr(processor_instance, method)
            superclass_method = getattr(Processor, method)
            if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__:
                doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n'
                break
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)
    subcommands = '''\
    worker      Start a processing worker rather than do local processing
    server      Start a processor server rather than do local processing
'''

    processing_worker_options = '''\
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --log-filename                  Filename to redirect STDOUT/STDERR to,
                                  if specified.
'''

    processing_server_options = '''\
  --address                       The Processor server address in format
                                  "{host}:{port}"
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
'''

    processing_options = '''\
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those).
                                  Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
  --debug                         Abort on any errors with full stack trace.
                                  Short-hand for OCRD_MISSING_OUTPUT=ABORT
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
                                  If URL starts with http:// start an HTTP server there,
                                  otherwise URL is a path to an on-demand-created unix socket
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]
  --log-filename LOG-PATH         File to redirect stderr logging to (overriding ocrd_logging.conf).
'''

    information_options = '''\
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version
'''

    parameter_help = ''
    if not ocrd_tool.get('parameters'):
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        for param_name, param in ocrd_tool['parameters'].items():
            # a required parameter is flagged as such, otherwise its default (if any) is shown
            if param.get('required'):
                annotation = ' - REQUIRED'
            elif 'default' in param:
                annotation = ' - %s' % json.dumps(param['default'])
            else:
                annotation = ''
            parameter_help += wrap('"%s" [%s%s]' % (param_name, param['type'], annotation))
            parameter_help += '\n ' + wrap(param['description'])
            if 'enum' in param:
                parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
            parameter_help += "\n"

    if not subcommand:
        return f'''\
Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]

  {ocrd_tool['description']}{doc_help}

Subcommands:
{subcommands}
Options for processing:
{processing_options}
Options for information:
{information_options}
Parameters:
{parameter_help}
'''
    if subcommand == 'worker':
        return f'''\
Usage: {ocrd_tool['executable']} worker [OPTIONS]

  Run {ocrd_tool['executable']} as a processing worker.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_worker_options}
'''
    if subcommand == 'server':
        return f'''\
Usage: {ocrd_tool['executable']} server [OPTIONS]

  Run {ocrd_tool['executable']} as a processor server.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_server_options}
'''
    # unknown subcommand: there is no help text to offer
    return None
1069