Passed
Pull Request — master (#1240)
by
unknown
02:46
created

ocrd.processor.base.Processor.show_help()   A

Complexity

Conditions 1

Size

Total Lines 6
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 1
nop 2
1
"""
2
Processor base class and helper functions.
3
"""
4
5
# Public API of this module; the helper functions are re-exported from
# ``.helpers`` for backwards compatibility.
__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]
11
12
from os.path import exists
13
from shutil import copyfileobj
14
import json
15
import os
16
from os import getcwd
17
from pathlib import Path
18
from typing import Optional
19
import sys
20
import inspect
21
import tarfile
22
import io
23
from deprecated import deprecated
24
25
from ocrd.workspace import Workspace
26
from ocrd_utils import (
27
    VERSION as OCRD_VERSION,
28
    MIMETYPE_PAGE,
29
    MIME_TO_EXT,
30
    getLogger,
31
    initLogging,
32
    list_resource_candidates,
33
    pushd_popd,
34
    list_all_resources,
35
    get_processor_resource_types,
36
    resource_filename,
37
    resource_string,
38
    make_file_id,
39
    deprecation_warning
40
)
41
from ocrd_validators import ParameterValidator
42
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml
43
from ocrd_modelfactory import page_from_file
44
45
# XXX imports must remain for backwards-compatibility
46
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
47
48
class ResourceNotFoundError(FileNotFoundError):
    """
    An exception signifying the requested processor resource
    cannot be resolved.

    Attributes:
        name: the resource name that could not be resolved
        executable: the processor executable the resource was requested for
        message: human-readable description including a download hint
    """
    def __init__(self, name, executable):
        self.name = name
        self.executable = executable
        self.message = (
            f"Could not find resource '{name}' for executable '{executable}'. "
            f"Try 'ocrd resmgr download {executable} {name}' to download this resource."
        )
        super().__init__(self.message)
60
61
class Processor():
    """
    A processor is a tool that implements the uniform OCR-D command-line interface
    for run-time data processing. That is, it executes a single workflow step,
    or a combination of workflow steps, on the workspace (represented by local METS).
    It reads input files for all or requested physical pages of the input fileGrp(s),
    and writes output files for them into the output fileGrp(s). It may take
    a number of optional or mandatory parameters.
    """
70
71
    @property
72
    def metadata(self):
73
        """the ocrd-tool.json dict of the package"""
74
        if hasattr(self, '_metadata'):
75
            return self._metadata
76
        self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json'))
77
        return self._metadata
78
79
    @property
80
    def version(self):
81
        """the version of the package"""
82
        if hasattr(self, '_version'):
83
            return self._version
84
        self._version = self.metadata['version']
85
        return self._version
86
87
    @property
88
    def executable(self):
89
        """the executable name of this processor tool"""
90
        if hasattr(self, '_executable'):
91
            return self._executable
92
        self._executable = os.path.basename(inspect.stack()[-1].filename)
93
        return self._executable
94
95
    @property
96
    def ocrd_tool(self):
97
        """the ocrd-tool.json dict of this processor tool"""
98
        if hasattr(self, '_ocrd_tool'):
99
            return self._ocrd_tool
100
        self._ocrd_tool = self.metadata['tools'][self.executable]
101
        return self._ocrd_tool
102
103
    def __init__(
            self,
            # FIXME: deprecate in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=True,
            version=None
    ):
        """
        Instantiate, but do not process. Validates ``parameter`` against the
        tool description and, if a ``workspace`` is passed (deprecated),
        changes into the workspace directory.

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \
                 before processing.
        Keyword Args:
             ocrd_tool (dict): Deprecated: the ocrd-tool.json dict of this tool; \
                 override the :py:attr:`metadata`, :py:attr:`executable` or \
                 :py:attr:`ocrd_tool` properties instead.
             parameter (dict): the runtime choices for ocrd-tool ``parameters``. \
                 If ``None``, an empty dict is validated and used.
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. \
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \
                 before processing.
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. \
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \
                 before processing.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
                 (or empty for all pages). \
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \
                 before processing.
             download_files (boolean): Whether input files will be downloaded prior to processing.
             version (string): Deprecated: the version of the package; override the \
                 :py:attr:`metadata` or :py:attr:`version` properties instead.

        Raises:
             ValueError: if ``parameter`` does not validate against the tool description.
        """
        if ocrd_tool is not None:
            deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                "use or override metadata/executable/ocrd-tool properties instead")
            self._ocrd_tool = ocrd_tool
            self._executable = ocrd_tool['executable']
        if version is not None:
            deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
                                "use or override metadata/version properties instead")
            self._version = version
        if workspace is not None:
            deprecation_warning("Passing a workspace argument other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.workspace = workspace
            # remembered so that resolve_resource can still search the original directory
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        if input_file_grp is not None:
            deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.input_file_grp = input_file_grp
        if output_file_grp is not None:
            deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.output_file_grp = output_file_grp
        if page_id is not None:
            deprecation_warning("Passing a page_id kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            # an empty string means "all pages", which is represented as None
            self.page_id = page_id or None
        # Make sure the attributes read by verify() and zip_input_files() always
        # exist, without clobbering anything a subclass has already provided.
        for attr in ('workspace', 'input_file_grp', 'output_file_grp', 'page_id'):
            if not hasattr(self, attr):
                setattr(self, attr, None)
        self.download = download_files
        if parameter is None:
            parameter = {}
        parameterValidator = ParameterValidator(self.ocrd_tool)

        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise ValueError("Invalid parameters %s" % report.errors)
        self.parameter = parameter
        # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
        setattr(self, 'process',
                deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
179
180
    def show_help(self, subcommand=None):
        """
        Print a usage description including the standard CLI and all of this processor's ocrd-tool
        parameters and docstrings.

        Args:
            subcommand: passed through to ``generate_processor_help``.
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self, subcommand=subcommand)
        print(help_text)
186
187
    def show_version(self):
        """
        Print information on this processor's version and OCR-D version.
        """
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))
192
193
    def verify(self):
194
        """
195
        Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.
196
        """
197
        assert self.input_file_grp is not None
198
        assert self.output_file_grp is not None
199
        input_file_grps = self.input_file_grp.split(',')
200
        output_file_grps = self.output_file_grp.split(',')
201
        def assert_file_grp_cardinality(grps, spec, msg):
202
            if isinstance(spec, int) and spec > 0:
203
                assert len(grps) == spec, msg % (len(grps), str(spec))
204
            else:
205
                minimum = spec[0]
206
                maximum = spec[1]
207
                if minimum > 0:
208
                    assert len(grps) >= minimum, msg % (len(grps), str(spec))
209
                if maximum > 0:
210
                    assert len(grps) <= maximum, msg % (len(grps), str(spec))
211
        # FIXME: maybe we should enforce the cardinality properties to be specified or apply default=1 here
212
        # (but we already have ocrd-tool validation, and these first need to be adopted by implementors)
213
        if 'input_file_grp_cardinality' in self.ocrd_tool:
214
            assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
215
                                        "Unexpected number of input file groups %d vs %s")
216
        if 'output_file_grp_cardinality' in self.ocrd_tool:
217
            assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
218
                                        "Unexpected number of output file groups %d vs %s")
219
        for input_file_grp in input_file_grps:
220
            assert input_file_grp in self.workspace.mets.file_groups
221
        # keep this for backwards compatibility:
222
        return True
223
224
    def dump_json(self):
225
        """
226
        Print :py:attr:`ocrd_tool` on stdout.
227
        """
228
        print(json.dumps(self.ocrd_tool, indent=True))
229
        return
230
231
    def dump_module_dir(self):
        """
        Print :py:attr:`moduledir` on stdout.
        """
        print(self.moduledir)
237
238
    def list_resources(self):
        """
        Find all installed resource files in the search paths and print their path names.

        Prints one entry of :py:meth:`list_all_resources` per line.
        """
        for res in self.list_all_resources():
            print(res)
245
246
    def setup(self) -> None:
247
        """
248
        Prepare the processor for actual data processing,
249
        prior to changing to the workspace directory but
250
        after parsing parameters.
251
252
        (Override this to load models into memory etc.)
253
        """
254
        pass
255
256
    @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')
    def process(self) -> None:
        """
        Process all files of the :py:attr:`workspace`
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id` (or all pages)
        under the given :py:attr:`parameter`.

        (This contains the main functionality and needs to be overridden by subclasses.)

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError()
268
269
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id` (or all pages)
        under the given :py:attr:`parameter`.

        (This will iterate over pages and files, calling
        :py:meth:`process_page_file`, handling exceptions.)

        Runs inside the workspace directory. If a ``NotImplementedError``
        escapes the page loop, falls back to the deprecated :py:meth:`process`.
        """
        log = getLogger('ocrd.processor.base')
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            self.verify()
            try:
                # FIXME: add page parallelization by running multiprocessing.Pool (#322)
                for input_file_tuple in self.zip_input_files(on_error='abort'):
                    # FIXME: add error handling by catching exceptions in various ways (#579)
                    # for example:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    input_files = [None] * len(input_file_tuple)
                    for i, input_file in enumerate(input_file_tuple):
                        if i == 0:
                            log.info("processing page %s", input_file.pageId)
                        elif input_file is None:
                            # file/page not found in this file grp
                            continue
                        input_files[i] = input_file
                        if not self.download:
                            continue
                        try:
                            input_files[i] = self.workspace.download_file(input_file)
                        except ValueError as e:
                            # the slot keeps the file as found in the METS (not downloaded)
                            log.error(repr(e))
                            log.warning("skipping file %s for page %s", input_file, input_file.pageId)
                    self.process_page_file(*input_files)
            except NotImplementedError:
                # fall back to deprecated method
                self.process()
311
312
    def process_page_file(self, *input_files) -> None:
        """
        Process the given ``input_files`` of the :py:attr:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`~ocrd_models.OcrdFile` per input fileGrp)
        under the given :py:attr:`parameter`, and make sure the
        results get added accordingly.

        Entries may be ``None`` for fileGrps that have no file for the page
        (except for the first one); these are passed on as ``None``.

        (This uses process_page_pcgts, but can be overridden by subclasses
        to handle cases like multiple fileGrps, non-PAGE input etc.)
        """
        log = getLogger('ocrd.processor.base')
        input_pcgts = [None] * len(input_files)
        page_id = input_files[0].pageId
        for i, input_file in enumerate(input_files):
            if input_file is None:
                # page has no file in this fileGrp – keep the slot empty
                continue
            # FIXME: what about non-PAGE input like image or JSON ???
            log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId)
            try:
                input_pcgts[i] = page_from_file(input_file)
            except ValueError as e:
                log.info("non-PAGE input for page %s: %s", page_id, e)
        output_file_id = make_file_id(input_files[0], self.output_file_grp)
        output_pcgts = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id)
        if isinstance(output_pcgts, (list, tuple)):
            # first element is the PAGE result, the rest are
            # (image, file ID, file path) triples of derived images
            output_images = output_pcgts[1:]
            output_pcgts = output_pcgts[0]
            for output_image_pil, output_image_id, output_image_path in output_images:
                self.workspace.save_image_file(
                    output_image_pil,
                    output_image_id,
                    self.output_file_grp,
                    page_id=page_id,
                    file_path=output_image_path)
        output_pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(output_pcgts)
        # FIXME: what about non-PAGE output like JSON ???
        self.workspace.add_file(file_id=output_file_id,
                                file_grp=self.output_file_grp,
                                page_id=page_id,
                                local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
                                mimetype=MIMETYPE_PAGE,
                                content=to_xml(output_pcgts))
354
355
    def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage:
        """
        Process the given ``input_pcgts`` of the :py:attr:`workspace`,
        representing one physical page (passed as one parsed
        :py:class:`~ocrd_models.OcrdPage` per input fileGrp)
        under the given :py:attr:`parameter`, and return the
        resulting :py:class:`~ocrd_models.OcrdPage`.

        Optionally, return a list or tuple of the :py:class:`~ocrd_models.OcrdPage`
        and one or more lists or tuples of :py:class:`PIL.Image` (image data),
        :py:class:str (file ID) and :py:class:str (file path) of derived images
        to be annotated along with the resulting PAGE file.

        (This contains the main functionality and must be overridden by subclasses.)

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError()
371
372
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
        the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.

        The item carries two label groups: one with every entry of
        :py:attr:`parameter`, one with the versions of this tool and of ocrd/core.
        """
        metadata = pcgts.get_Metadata()
        executable = self.ocrd_tool['executable']
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=name, value=value)
                   for name, value in self.parameter.items()])
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=executable, value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        metadata.add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             # only the first declared step is recorded
                             name=self.ocrd_tool['steps'][0],
                             value=executable,
                             Labels=[parameter_labels, version_labels]))
395
396
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        If ``val`` itself exists as a path, it is returned unchanged.
        Otherwise the first existing candidate from
        ``list_resource_candidates`` is returned.

        Args:
            val (string): resource value to resolve

        Raises:
            ResourceNotFoundError: if neither ``val`` nor any candidate exists.
        """
        initLogging()
        executable = self.ocrd_tool['executable']
        log = getLogger('ocrd.processor.base')
        if exists(val):
            log.debug("Resolved to absolute path %s", val)
            return val
        # search relative to the directory we started from, if we changed away from it
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        for cand in list_resource_candidates(executable, val,
                                             cwd=cwd, moduled=self.moduledir):
            if exists(cand):
                log.debug("Resolved %s to absolute path %s", val, cand)
                return cand
        raise ResourceNotFoundError(val, executable)
421
422
    def show_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters,
        then print its contents to stdout.

        A regular file is written as raw bytes; a directory is written
        as a gzip-compressed tarball of its contents.

        Args:
            val (string): resource value to show
        """
        fpath = Path(self.resolve_resource(val))
        if not fpath.is_dir():
            sys.stdout.buffer.write(fpath.read_bytes())
            return
        with pushd_popd(fpath):
            # build the archive in memory, with member names relative to the directory
            fileobj = io.BytesIO()
            with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball:
                tarball.add('.')
            fileobj.seek(0)
            copyfileobj(fileobj, sys.stdout.buffer)
443
444
    def list_all_resources(self):
        """
        List all resources found in the filesystem and matching content-type by filename suffix

        Yields:
            :py:class:`pathlib.Path` objects of the matching resources.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            res = Path(res)
            if '*/*' not in mimetypes:
                if res.is_dir() and 'text/directory' not in mimetypes:
                    continue
                # if we do not know all MIME types, then keep the file, otherwise require suffix match
                if res.is_file() and not any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
                                             for mime in mimetypes):
                    continue
            yield res
459
460
    @property
461
    def module(self):
462
        """
463
        The top-level module this processor belongs to.
464
        """
465
        # find shortest prefix path that is not just a namespace package
466
        fqname = ''
467
        for name in self.__module__.split('.'):
468
            if fqname:
469
                fqname += '.'
470
            fqname += name
471
            if getattr(sys.modules[fqname], '__file__', None):
472
                return fqname
473
        # fall-back
474
        return self.__module__
475
476
    @property
    def moduledir(self):
        """
        The filesystem path of the module directory.

        Obtained via ``resource_filename`` for :py:attr:`module`.
        """
        return resource_filename(self.module, '.')
482
483
    @property
    def input_files(self):
        """
        List the input files (for single-valued :py:attr:`input_file_grp`).

        For each physical page:

        - If there is a single PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there is a single image file, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)

        `Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.

        Raises:
            ValueError: if :py:attr:`input_file_grp` is empty or unset.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")
        file_tuples = self.zip_input_files(mimetype=None, on_error='abort')
        if not file_tuples:
            return []
        assert len(file_tuples[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
        return [file_tuple[0] for file_tuple in file_tuples]
508
509
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Multiple matches for PAGE-XML will always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
             on_error (string): How to handle multiple matches per page and
                 fileGrp: ``skip``, ``first``, ``last`` or ``abort`` (see above).
                 With ``abort``, finding no files at all for a non-empty
                 :py:attr:`page_id` raises as well (otherwise only a warning is logged).
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.

        Raises:
            ValueError: if :py:attr:`input_file_grp` is empty, on multiple
                PAGE-XML matches, or on conflicts/missing files with ``abort``.
            Exception: if a conflict occurs and ``on_error`` is not a known strategy.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")

        def resolve_conflict(ift, i, file_, abort_msg):
            # apply the on_error strategy to a further candidate for an occupied slot
            if on_error == 'skip':
                ift[i] = None
            elif on_error == 'first':
                pass # keep first match
            elif on_error == 'last':
                ift[i] = file_
            elif on_error == 'abort':
                raise ValueError(abort_msg)
            else:
                raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        # maps page ID to a list with one slot per input fileGrp
        pages = {}
        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype)
            # Warn if no files found but pageId was specified because that
            # might be because of invalid page_id (range)
            if self.page_id and not files_:
                msg = (f"Could not find any files for --page-id {self.page_id} - "
                       f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
                if on_error == 'abort':
                    raise ValueError(msg)
                LOG.warning(msg)
            for file_ in files_:
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if not ift[i]:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                if mimetype:
                    # filter was active, this must not happen
                    resolve_conflict(ift, i, file_,
                                     "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                                         mimetype, file_.pageId, ifg))
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    # otherwise keep PAGE match
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    resolve_conflict(ift, i, file_,
                                     "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                                         file_.pageId, ifg))
        ifts = []
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
629