Passed
Pull Request — master (#1240)
by Konstantin
03:00
created

ocrd.processor.base.Processor.process()   A

Complexity

Conditions 1

Size

Total Lines 13
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 13
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""
2
Processor base class and helper functions.
3
"""
4
5
# Names exported by ``from ... import *``; the helper functions are
# re-exported from ``.helpers`` for backwards-compatibility.
__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor',
]
11
12
from os.path import exists, join
13
from shutil import copyfileobj
14
import json
15
import os
16
from os import getcwd
17
from pathlib import Path
18
from typing import List, Optional, Union, get_args
19
import sys
20
import inspect
21
import tarfile
22
import io
23
from warnings import warn
24
from deprecated import deprecated
25
from requests import HTTPError
26
27
from ocrd.workspace import Workspace
28
from ocrd_models.ocrd_file import OcrdFileType
29
from ocrd.processor.ocrd_page_result import OcrdPageResult
30
from ocrd_utils import (
31
    VERSION as OCRD_VERSION,
32
    MIMETYPE_PAGE,
33
    MIME_TO_EXT,
34
    config,
35
    getLogger,
36
    initLogging,
37
    list_resource_candidates,
38
    pushd_popd,
39
    list_all_resources,
40
    get_processor_resource_types,
41
    resource_filename,
42
    resource_string,
43
    make_file_id,
44
    deprecation_warning
45
)
46
from ocrd_validators import ParameterValidator
47
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml
48
from ocrd_modelfactory import page_from_file
49
from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
50
51
# XXX imports must remain for backwards-compatibility
52
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
53
54
55
class ResourceNotFoundError(FileNotFoundError):
    """
    An exception signifying the requested processor resource
    cannot be resolved.

    Attributes:
        name: the resource name that was requested
        executable: the processor executable the resource was requested for
        message: the formatted error text (also passed to the base class)
    """
    def __init__(self, name, executable):
        self.name = name
        self.executable = executable
        self.message = (f"Could not find resource '{name}' for executable '{executable}'. "
                        f"Try 'ocrd resmgr download {executable} {name}' to download this resource.")
        super().__init__(self.message)

    def __reduce__(self):
        # By default, an exception is re-created from ``self.args`` (here: only
        # the formatted message), which does not match this two-argument
        # constructor and makes unpickling fail. Rebuild from the original
        # arguments instead.
        return (self.__class__, (self.name, self.executable))
66
67
class NonUniqueInputFile(ValueError):
    """
    An exception signifying the specified fileGrp / pageId / mimetype
    selector yields multiple PAGE files, or no PAGE files but multiple images,
    or multiple files of that mimetype.

    Attributes:
        fileGrp: the file group of the selector
        pageId: the page ID of the selector
        mimetype: the MIME type of the selector (a falsy value is
            reported as ``PAGE+image(s)`` in the message)
        message: the formatted error text (also passed to the base class)
    """
    def __init__(self, fileGrp, pageId, mimetype):
        self.fileGrp = fileGrp
        self.pageId = pageId
        self.mimetype = mimetype
        self.message = (f"Could not determine unique input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
        super().__init__(self.message)

    def __reduce__(self):
        # By default, an exception is re-created from ``self.args`` (here: only
        # the formatted message), which does not match this three-argument
        # constructor and makes unpickling fail. Rebuild from the original
        # arguments instead.
        return (self.__class__, (self.fileGrp, self.pageId, self.mimetype))
80
81
class MissingInputFile(ValueError):
    """
    An exception signifying the specified fileGrp / pageId / mimetype
    selector yields no PAGE files, or no PAGE and no image files,
    or no files of that mimetype.

    Attributes:
        fileGrp: the file group of the selector
        pageId: the page ID of the selector
        mimetype: the MIME type of the selector (a falsy value is
            reported as ``PAGE+image(s)`` in the message)
        message: the formatted error text (also passed to the base class)
    """
    def __init__(self, fileGrp, pageId, mimetype):
        self.fileGrp = fileGrp
        self.pageId = pageId
        self.mimetype = mimetype
        self.message = (f"Could not find input file for fileGrp {fileGrp} "
                        f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
        super().__init__(self.message)

    def __reduce__(self):
        # By default, an exception is re-created from ``self.args`` (here: only
        # the formatted message), which does not match this three-argument
        # constructor and makes unpickling fail. Rebuild from the original
        # arguments instead.
        return (self.__class__, (self.fileGrp, self.pageId, self.mimetype))
94
95
class Processor():
96
    """
97
    A processor is a tool that implements the uniform OCR-D command-line interface
98
    for run-time data processing. That is, it executes a single workflow step,
99
    or a combination of workflow steps, on the workspace (represented by local METS).
100
    It reads input files for all or requested physical pages of the input fileGrp(s),
101
    and writes output files for them into the output fileGrp(s). It may take 
102
    a number of optional or mandatory parameters.
103
    """
104
105
    @property
    def metadata(self) -> dict:
        """
        The ocrd-tool.json dict of the package.

        On first access, this is parsed from the ``ocrd-tool.json`` resource of
        the first component of this class's module path, validated with
        :py:class:`OcrdToolValidator`, and cached in ``_metadata``.
        A failed validation is logged as an error, but does not raise.
        """
        if hasattr(self, '_metadata'):
            return self._metadata
        self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json'))
        report = OcrdToolValidator.validate(self._metadata)
        if not report.is_valid:
            # FIXME: remove when bertsky/core#10 is merged
            self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
            # The dict just failed validation, so no key can be taken for granted:
            # a missing 'git_url' must not turn the diagnostic into a KeyError.
            git_url = self._metadata.get('git_url', '<missing git_url>')
            self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n{report.to_xml()}.\nPlease open an issue at {git_url}.")
        return self._metadata
117
118
    @property
    def version(self) -> str:
        """
        The version of the package.

        Taken from the ``version`` entry of :py:attr:`metadata` on first
        access and cached in ``_version`` afterwards.
        """
        if not hasattr(self, '_version'):
            self._version = self.metadata['version']
        return self._version
125
126
    @property
    def executable(self) -> str:
        """
        The executable name of this processor tool.

        Derived on first access from the file name of the last entry of
        the current call stack, and cached in ``_executable`` afterwards.
        """
        if not hasattr(self, '_executable'):
            outermost_frame = inspect.stack()[-1]
            self._executable = os.path.basename(outermost_frame.filename)
        return self._executable
133
134
    @property
    def ocrd_tool(self) -> dict:
        """
        The ocrd-tool.json dict of this processor tool.

        Looked up on first access under ``tools`` in :py:attr:`metadata`,
        keyed by :py:attr:`executable`, and cached in ``_ocrd_tool`` afterwards.
        """
        if not hasattr(self, '_ocrd_tool'):
            tools = self.metadata['tools']
            self._ocrd_tool = tools[self.executable]
        return self._ocrd_tool
141
142
    def __init__(
            self,
            # FIXME: deprecate in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=config.OCRD_DOWNLOAD_INPUT,
            version=None
    ):
        """
        Instantiate, but do not setup (neither for processing nor other usage).
        If given, do parse and validate :py:data:`.parameter`.

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
                 If not ``None``, then `chdir` to that directory.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \
                 before processing.
        Keyword Args:
             ocrd_tool (dict): the ocrd-tool.json dict of this processor tool; if given, \
                 overrides the :py:attr:`ocrd_tool` and :py:attr:`executable` properties. \
                 Deprecated: use or override the properties instead.
             parameter (dict): the runtime choices for ocrd-tool ``parameters`` \
                 (``None`` is treated as an empty dict). \
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input. \
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \
                 before processing.
             output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output. \
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \
                 before processing.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
                 (or empty for all pages). \
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \
                 before processing.
             download_files (boolean): Whether input files will be downloaded prior to processing, \
                 defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT` which is ``True`` by default
             version (string): if given, overrides the :py:attr:`version` property. \
                 Deprecated: use or override the property instead.

        Raises:
             ValueError: if ``parameter`` does not validate against :py:attr:`ocrd_tool`
        """
        if ocrd_tool is not None:
            deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                "use or override metadata/executable/ocrd-tool properties instead")
            # pre-fill the caches read by the ocrd_tool / executable properties
            self._ocrd_tool = ocrd_tool
            self._executable = ocrd_tool['executable']
        if version is not None:
            deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
                                "use or override metadata/version properties instead")
            # pre-fill the cache read by the version property
            self._version = version
        if workspace is not None:
            deprecation_warning("Passing a workspace argument other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.workspace = workspace
            # remember where we came from (resolve_resource searches relative to it)
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        if input_file_grp is not None:
            deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.input_file_grp = input_file_grp
        if output_file_grp is not None:
            deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.output_file_grp = output_file_grp
        if page_id is not None:
            deprecation_warning("Passing a page_id kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            # normalize an empty selection to None
            self.page_id = page_id or None
        self.download = download_files
        if parameter is None:
            parameter = {}
        parameter_validator = ParameterValidator(self.ocrd_tool)

        report = parameter_validator.validate(parameter)
        if not report.is_valid:
            # f-string instead of %-formatting: the latter would misbehave
            # if the errors were ever passed as a tuple
            raise ValueError(f"Invalid parameters {report.errors}")
        self.parameter = parameter
        # NOTE: this is the logger to be used by processor implementations,
        # `processor.base` default implementations should use
        # :py:attr:`self._base_logger`
        self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
        self._base_logger = getLogger('ocrd.processor.base')
        # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
        setattr(self, 'process',
                deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
223
224
    def show_help(self, subcommand=None):
        """
        Print the usage text produced by ``generate_processor_help`` for this
        processor's ocrd-tool description (standard CLI plus all of its
        parameters and docstrings).

        Keyword Args:
            subcommand: passed through to ``generate_processor_help``
        """
        help_text = generate_processor_help(self.ocrd_tool,
                                            processor_instance=self,
                                            subcommand=subcommand)
        print(help_text)
230
231
    def show_version(self):
        """
        Print this processor's version along with the OCR-D core version.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
236
237
    def verify(self):
        """
        Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.

        Both attributes must be set. If the ocrd-tool description declares
        ``input_file_grp_cardinality`` / ``output_file_grp_cardinality``,
        the number of comma-separated fileGrps must match it. Finally,
        every input fileGrp must exist in the METS of :py:attr:`workspace`.

        Returns:
            ``True`` (kept for backwards compatibility)

        Raises:
            AssertionError: if any of the above requirements is violated
        """
        assert self.input_file_grp is not None
        assert self.output_file_grp is not None
        input_file_grps = self.input_file_grp.split(',')
        output_file_grps = self.output_file_grp.split(',')
        def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
            # ``spec`` is either an exact count or a [minimum, maximum] pair;
            # non-positive values are not enforced
            if isinstance(spec, int):
                if spec > 0:
                    assert len(grps) == spec, msg % (len(grps), str(spec))
            else:
                assert isinstance(spec, list)
                minimum = spec[0]
                maximum = spec[1]
                if minimum > 0:
                    assert len(grps) >= minimum, msg % (len(grps), str(spec))
                if maximum > 0:
                    assert len(grps) <= maximum, msg % (len(grps), str(spec))
        # FIXME: maybe we should enforce the cardinality properties to be specified or apply default=1 here
        # (but we already have ocrd-tool validation, and these first need to be adopted by implementors)
        if 'input_file_grp_cardinality' in self.ocrd_tool:
            assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
                                        "Unexpected number of input file groups %d vs %s")
        if 'output_file_grp_cardinality' in self.ocrd_tool:
            assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
                                        "Unexpected number of output file groups %d vs %s")
        # query the METS once instead of once per input fileGrp
        existing_file_grps = self.workspace.mets.file_groups
        for input_file_grp in input_file_grps:
            assert input_file_grp in existing_file_grps, \
                f"input fileGrp {input_file_grp} does not exist in the workspace"
        # keep this for backwards compatibility:
        return True
269
270
    def dump_json(self):
        """
        Serialize :py:attr:`ocrd_tool` as JSON and print it on stdout.
        """
        tool_json = json.dumps(self.ocrd_tool, indent=True)
        print(tool_json)
276
277
    def dump_module_dir(self):
        """
        Write the value of :py:attr:`moduledir` to stdout.
        """
        module_directory = self.moduledir
        print(module_directory)
283
284
    def list_resources(self):
        """
        Print the path name of every resource file yielded by
        :py:meth:`list_all_resources`, one per line.
        """
        for resource_path in self.list_all_resources():
            print(resource_path)
291
292
    def setup(self) -> None:
        """
        Hook for preparing the processor for actual data processing,
        called after parameters have been parsed but before changing
        into the workspace directory.

        The default implementation does nothing.
        (Override this to load models into memory etc.)
        """
        return None
301
302
    @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
    def process(self) -> None:
        """
        Deprecated entry point: process all files of the :py:data:`workspace`
        from :py:data:`input_file_grp` to :py:data:`output_file_grp`
        for :py:data:`page_id` (or all pages) under :py:data:`parameter`.

        The base implementation only raises :py:class:`NotImplementedError`.
        (This contains the main functionality and needs to be
        overridden by subclasses.)
        """
        raise NotImplementedError
315
316
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:data:`input_file_grp`
        to the given :py:data:`output_file_grp`
        for the given :py:data:`page_id` (or all pages)
        under the given :py:data:`parameter`.

        (This will iterate over pages and files, calling
        :py:meth:`.process_page_file` and handling exceptions.
        It should be overridden by subclasses to handle cases
        like post-processing or computation across pages.)

        Failures of a page are handled according to
        ``config.OCRD_EXISTING_OUTPUT`` (for :py:class:`FileExistsError`) and
        ``config.OCRD_MISSING_OUTPUT`` (for everything else). If a
        :py:class:`NotImplementedError` comes up, the deprecated
        :py:meth:`.process` is called as a fall-back.

        Args:
            workspace (:py:class:`~ocrd.Workspace`): The workspace to process;
                its directory becomes the working directory for the duration
                of the call.
        """
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            self.verify()
            try:
                # FIXME: add page parallelization by running multiprocessing.Pool (#322)
                for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
                    input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
                    # the page ID is taken from the first fileGrp which has this page
                    page_id = next(input_file.pageId
                                   for input_file in input_file_tuple
                                   if input_file)
                    self._base_logger.info(f"processing page {page_id}")
                    for i, input_file in enumerate(input_file_tuple):
                        if input_file is None:
                            # file/page not found in this file grp
                            continue
                        input_files[i] = input_file
                        if not self.download:
                            continue
                        try:
                            input_files[i] = self.workspace.download_file(input_file)
                        except (ValueError, FileNotFoundError, HTTPError) as e:
                            # keep the file as it is, and let processing decide
                            self._base_logger.error(repr(e))
                            self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
                    # FIXME: differentiate error cases in various ways:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    try:
                        self.process_page_file(*input_files)
                    except NotImplementedError:
                        # must reach the outer handler (fall-back to process())
                        raise
                    except Exception as err:
                        # we have to be broad here
                        if isinstance(err, FileExistsError):
                            if config.OCRD_EXISTING_OUTPUT == 'ABORT':
                                raise
                            if config.OCRD_EXISTING_OUTPUT == 'SKIP':
                                continue
                            if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
                                # too late here, must not happen
                                raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") from err
                        # FIXME: re-usable/actionable logging
                        self._base_logger.exception(f"Failure on page {page_id}: {err}")
                        if config.OCRD_MISSING_OUTPUT == 'ABORT':
                            raise
                        if config.OCRD_MISSING_OUTPUT == 'SKIP':
                            continue
                        if config.OCRD_MISSING_OUTPUT == 'COPY':
                            # NOTE(review): with require_first=False the first entry
                            # may be None, which _copy_page_file rejects - confirm
                            self._copy_page_file(input_files[0])
                        else:
                            desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
                            raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
            except NotImplementedError:
                # fall back to deprecated method
                self.process()
384
385
    def _copy_page_file(self, input_file : OcrdFileType) -> None:
        """
        Copy the given ``input_file`` of the :py:data:`workspace`
        (representing one physical page) into the output fileGrp,
        adding it as if it was a processing result.

        The file is parsed as PAGE, gets a new ``pcGtsId`` and a metadata
        item for this processing step, and is then added to the workspace.
        If it cannot be parsed (:py:class:`ValueError`), an error is logged
        and nothing is added.
        """
        assert isinstance(input_file, get_args(OcrdFileType))
        self._base_logger.debug(f"parsing file {input_file.ID} for page {input_file.pageId}")
        pcgts : OcrdPage
        try:
            pcgts = page_from_file(input_file)
        except ValueError as err:
            # not PAGE and not an image to generate PAGE for
            self._base_logger.error(f"non-PAGE input for page {input_file.pageId}: {err}")
            return
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.add_metadata(pcgts)
        self.workspace.add_file(
            file_id=file_id,
            file_grp=self.output_file_grp,
            page_id=input_file.pageId,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
        )
412
413
    def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
        """
        Process the given ``input_files`` of the :py:data:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`.OcrdFile` per input fileGrp)
        under the given :py:data:`.parameter`, and make sure the
        results get added accordingly.

        Entries may be ``None`` for fileGrps which do not contain the page;
        they are passed on as ``None`` to :py:meth:`.process_page_pcgts`.
        The same holds for files which cannot be parsed as PAGE.
        Page ID and output file ID are derived from the first file present.

        (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses
        to handle cases like multiple output fileGrps, non-PAGE input etc.)

        Raises:
            AssertionError: if no input file is present at all
        """
        input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
        # the first fileGrp may lack this page, so use the first file that is present
        first_file = next((input_file for input_file in input_files
                           if input_file is not None), None)
        assert isinstance(first_file, get_args(OcrdFileType))
        page_id = first_file.pageId
        for i, input_file in enumerate(input_files):
            if input_file is None:
                # page not available in this fileGrp: keep the None placeholder
                self._base_logger.debug(f"ignoring missing input file #{i} for page {page_id}")
                continue
            assert isinstance(input_file, get_args(OcrdFileType))
            self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
            try:
                page_ = page_from_file(input_file)
                assert isinstance(page_, OcrdPage)
                input_pcgts[i] = page_
            except ValueError as err:
                # not PAGE and not an image to generate PAGE for
                self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
        output_file_id = make_file_id(first_file, self.output_file_grp)
        result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
        # save derived images first, so the PAGE result can reference their paths
        for image_result in result.images:
            image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
            image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
            image_result.alternative_image.set_filename(image_file_path)
            self.workspace.save_image_file(
                image_result.pil,
                image_file_id,
                self.output_file_grp,
                page_id=page_id,
                file_path=image_file_path,
                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
            )
        result.pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(result.pcgts)
        self.workspace.add_file(file_id=output_file_id,
                                file_grp=self.output_file_grp,
                                page_id=page_id,
                                local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
                                mimetype=MIMETYPE_PAGE,
                                content=to_xml(result.pcgts),
                                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
        )
461
462
    def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
        """
        Process the given ``input_pcgts`` of the :py:data:`.workspace`,
        representing one physical page (passed as one parsed
        :py:class:`.OcrdPage` per input fileGrp)
        under the given :py:data:`.parameter`, and return the
        resulting :py:class:`.OcrdPageResult`.

        Optionally, add to the ``images`` attribute of the resulting
        :py:class:`.OcrdPageResult` instances of :py:class:`.OcrdPageResultImage`,
        which have required fields for ``pil`` (:py:class:`PIL.Image` image data),
        ``file_id_suffix`` (used for generating IDs of the saved image) and
        ``alternative_image`` (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType`
        for setting the filename of the saved image).

        The base implementation only raises :py:class:`NotImplementedError`.
        (This contains the main functionality and must be overridden by subclasses,
        unless it does not get called by some overridden :py:meth:`.process_page_file`.)
        """
        raise NotImplementedError
481
482
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
        the processing step and runtime parameters to :py:class:`.OcrdPage` ``pcgts``.

        The item is named after the first entry of the tool's ``steps``,
        valued with its ``executable``, and carries two label groups:
        one with all :py:data:`.parameter` entries, and one with the
        versions of the processor and of ocrd/core.
        """
        metadata_obj = pcgts.get_Metadata()
        assert metadata_obj is not None
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=name, value=self.parameter[name])
                   for name in self.parameter.keys()])
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=self.ocrd_tool['executable'],
                             value=self.version),
                   LabelType(type_='ocrd/core',
                             value=OCRD_VERSION)])
        item = MetadataItemType(type_="processingStep",
                                name=step_name,
                                value=executable,
                                Labels=[parameter_labels, version_labels])
        metadata_obj.add_MetadataItem(item)
507
508
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_

        If ``val`` itself exists as a path, it is returned as is.
        Otherwise the first existing candidate listed by
        ``list_resource_candidates`` is returned.

        Args:
            val (string): resource value to resolve

        Raises:
            ResourceNotFoundError: if neither ``val`` nor any candidate exists
        """
        executable = self.ocrd_tool['executable']
        if exists(val):
            self._base_logger.debug("Resolved to absolute path %s" % val)
            return val
        # FIXME: remove once workspace arg / old_pwd is gone:
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=cwd, moduled=self.moduledir)
        for candidate in candidates:
            if not exists(candidate):
                continue
            self._base_logger.debug("Resolved %s to absolute path %s" % (val, candidate))
            return candidate
        raise ResourceNotFoundError(val, executable)
532
533
    def show_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_,
        then print its contents to stdout.

        A regular file is written verbatim; a directory is written
        as a gzip-compressed tar archive of its contents.

        Args:
            val (string): resource value to show
        """
        resource_path = Path(self.resolve_resource(val))
        if not resource_path.is_dir():
            sys.stdout.buffer.write(resource_path.read_bytes())
            return
        # archive relative to the directory itself, so member names start at '.'
        with pushd_popd(resource_path):
            archive = io.BytesIO()
            with tarfile.open(fileobj=archive, mode='w:gz') as tarball:
                tarball.add('.')
            # rewind the in-memory archive before streaming it out
            archive.seek(0)
            copyfileobj(archive, sys.stdout.buffer)
553
554
    def list_all_resources(self):
        """
        Yield (as :py:class:`pathlib.Path`) all resources found in the filesystem
        and matching the processor's resource content-types by filename suffix.

        Unless the wildcard type ``*/*`` is among the resource types,
        directories are only kept if ``text/directory`` is, and regular
        files only if their suffix matches at least one of the types.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        for found in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            res = Path(found)
            if '*/*' not in mimetypes:
                if res.is_dir() and 'text/directory' not in mimetypes:
                    continue
                if res.is_file():
                    suffix = res.suffix
                    # if we do not know all MIME types, then keep the file, otherwise require suffix match
                    if all(suffix != MIME_TO_EXT.get(mime, suffix)
                           for mime in mimetypes):
                        continue
            yield res
569
570
    @property
    def module(self):
        """
        The top-level module this processor belongs to.

        This is the shortest dotted prefix of this class's module name
        whose entry in ``sys.modules`` has a non-empty ``__file__``
        (i.e. which is not just a namespace package), or the full
        module name if there is none.
        """
        components = self.__module__.split('.')
        for depth in range(1, len(components) + 1):
            fqname = '.'.join(components[:depth])
            if getattr(sys.modules[fqname], '__file__', None):
                return fqname
        # fall-back
        return self.__module__
585
586
    @property
    def moduledir(self):
        """
        The filesystem path of the module directory, i.e. the path
        ``resource_filename`` yields for ``'.'`` within :py:attr:`module`.
        """
        top_level_module = self.module
        return resource_filename(top_level_module, '.')
592
593
    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        For each physical page:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)

        See `algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

        This delegates to :py:meth:`zip_input_files` (without MIME type
        filter, aborting on errors) and unwraps its one-element tuples.

        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.

        Raises:
            ValueError: if :py:attr:`input_file_grp` is empty
            AssertionError: if there is more than one input fileGrp
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        file_tuples = self.zip_input_files(mimetype=None, on_error='abort')
        if not file_tuples:
            return []
        first_tuple = file_tuples[0]
        assert len(first_tuple) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [file_tuple[0] for file_tuple in file_tuples]
619
620
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all), regardless
          of how many further matches follow
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Multiple matches for PAGE-XML will always raise an exception
        (when no :py:attr:`mimetype` filter is active).

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             on_error (string): How to handle multiple file matches per page.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        Raises:
            ValueError: if :py:attr:`input_file_grp` is empty, or if a
                conflict occurs and ``on_error`` is not a known strategy.
            NonUniqueInputFile: on multiple PAGE files for one page, or on
                any conflict with ``on_error='abort'``.
            MissingInputFile: if a page has no file in some fileGrp and
                ``config.OCRD_MISSING_INPUT`` is ``abort``.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        ifgs = self.input_file_grp.split(",")
        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = {}
        # (pageId, fileGrp index) slots voided by on_error='skip'; remembered so
        # that a third (fifth, ...) match cannot silently re-fill the slot
        skipped = set()
        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype)
            for file_ in files_:
                if not file_.pageId:
                    # ignore document-global files
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if (file_.pageId, i) in skipped:
                    self._base_logger.debug(f"ignoring file {file_.ID} for already skipped page {file_.pageId} in input file group {ifg}")
                    continue
                if not ift[i]:
                    self._base_logger.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}")
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                self._base_logger.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}")
                if mimetype:
                    # filter was active, this must not happen
                    reason = f"of same MIME type {mimetype}"
                    conflict_mimetype = mimetype
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        # two PAGE files for one page are never acceptable
                        raise NonUniqueInputFile(ifg, file_.pageId, None)
                    # keep PAGE match
                    continue
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    reason = "but no PAGE available"
                    conflict_mimetype = None
                self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
                                          f"conflicts with file {ift[i].ID} {reason} - on_error={on_error}")
                if on_error == 'skip':
                    ift[i] = None
                    skipped.add((file_.pageId, i))
                elif on_error == 'first':
                    pass # keep first match
                elif on_error == 'last':
                    ift[i] = file_
                elif on_error == 'abort':
                    raise NonUniqueInputFile(ifg, file_.pageId, conflict_mimetype)
                else:
                    raise ValueError("Unknown 'on_error' strategy '%s'" % on_error)
        # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range)
        if self.page_id and not pages:
            self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
                                       f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
        ifts = []
        for page, ifiles in pages.items():
            for ifg, ifile in zip(ifgs, ifiles):
                if not ifile:
                    # could be from non-unique with on_error=skip or from true gap
                    self._base_logger.error(f'Found no file for page {page} in file group {ifg}')
                    if config.OCRD_MISSING_INPUT == 'abort':
                        raise MissingInputFile(ifg, page, mimetype)
            if not any(ifiles):
                # must be from non-unique with on_error=skip
                self._base_logger.warning(f'Found no files for {page} - skipping')
                continue
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
741