Passed
Pull Request — master (#1240)
by
unknown
02:52
created

ocrd.processor.base.Processor.version()   A

Complexity

Conditions 2

Size

Total Lines 7
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 6
dl 0
loc 7
rs 10
c 0
b 0
f 0
cc 2
nop 1
1
"""
2
Processor base class and helper functions.
3
"""
4
5
# Public names exported by ``from ocrd.processor.base import *``.
# The helper functions are re-exported from ``.helpers`` (imported below)
# for backwards-compatibility.
__all__ = [
    'Processor',
    'generate_processor_help',
    'run_cli',
    'run_processor'
]
11
12
from os.path import exists
13
from shutil import copyfileobj
14
import json
15
import os
16
from os import getcwd
17
from pathlib import Path
18
from typing import Optional
19
import sys
20
import inspect
21
import tarfile
22
import io
23
from deprecated import deprecated
24
25
from ocrd.workspace import Workspace
26
from ocrd_utils import (
27
    VERSION as OCRD_VERSION,
28
    MIMETYPE_PAGE,
29
    MIME_TO_EXT,
30
    getLogger,
31
    initLogging,
32
    list_resource_candidates,
33
    pushd_popd,
34
    list_all_resources,
35
    get_processor_resource_types,
36
    resource_filename,
37
    resource_string,
38
    make_file_id,
39
    deprecation_warning
40
)
41
from ocrd_validators import ParameterValidator
42
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml
43
from ocrd_modelfactory import page_from_file
44
45
# XXX imports must remain for backwards-compatibility
46
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
47
48
class ResourceNotFoundError(FileNotFoundError):
    """
    Raised when a requested processor resource cannot be resolved.

    Attributes:
        name: the resource name that was requested
        executable: the processor executable the resource was requested for
        message: human-readable explanation, including a download hint
    """
    def __init__(self, name, executable):
        template = ("Could not find resource '%s' for executable '%s'. "
                    "Try 'ocrd resmgr download %s %s' to download this resource.")
        self.name = name
        self.executable = executable
        # the hint takes the executable first, then the resource name
        self.message = template % (name, executable, executable, name)
        super().__init__(self.message)
60
61
class Processor():
62
    """
63
    A processor is a tool that implements the uniform OCR-D command-line interface
64
    for run-time data processing. That is, it executes a single workflow step,
65
    or a combination of workflow steps, on the workspace (represented by local METS).
66
    It reads input files for all or requested physical pages of the input fileGrp(s),
67
    and writes output files for them into the output fileGrp(s). It may take 
68
    a number of optional or mandatory parameters.
69
    """
70
71
    @property
72
    def metadata(self):
73
        """the ocrd-tool.json dict of the package"""
74
        if hasattr(self, '_metadata'):
75
            return self._metadata
76
        self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json'))
77
        return self._metadata
78
79
    @property
80
    def version(self):
81
        """the version of the package"""
82
        if hasattr(self, '_version'):
83
            return self._version
84
        self._version = self.metadata['version']
85
        return self._version
86
87
    @property
88
    def executable(self):
89
        """the executable name of this processor tool"""
90
        if hasattr(self, '_executable'):
91
            return self._executable
92
        self._executable = os.path.basename(inspect.stack()[-1].filename)
93
        return self._executable
94
95
    @property
96
    def ocrd_tool(self):
97
        """the ocrd-tool.json dict of this processor tool"""
98
        if hasattr(self, '_ocrd_tool'):
99
            return self._ocrd_tool
100
        self._ocrd_tool = self.metadata['tools'][self.executable]
101
        return self._ocrd_tool
102
103
    def __init__(
            self,
            # FIXME: deprecate in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=True,
            # FIXME: deprecate all the following in favor of respective methods
            resolve_resource=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            subcommand=None,
            show_version=False,
            dump_json=False,
            dump_module_dir=False,
            version=None
    ):
        """
        Instantiate, but do not process. Unless ``dump_json`` or
        ``dump_module_dir`` or ``list_resources`` or ``resolve_resource`` or
        ``show_resource`` or ``show_help`` or ``show_version`` is set
        (in which case the respective information is printed and the
        constructor returns early), setup for processing
        (validating parameters, entering the workspace directory).

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
                 Can be ``None`` even for processing (esp. on multiple workspaces), \
                 but then needs to be set before running.
        Keyword Args:
             ocrd_tool (dict): Deprecated. If not ``None``, overrides :py:attr:`ocrd_tool` \
                 and (via its ``executable`` entry) :py:attr:`executable`.
             parameter (dict): The runtime choices for ocrd-tool ``parameters``. \
                 ``None`` is treated as an empty dict. Validated against :py:attr:`ocrd_tool`.
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
                 (or empty for all pages).
             download_files (boolean): Whether input files will be downloaded prior to processing.
             resolve_resource (string): If not ``None``, then instead of processing, resolve \
                 given resource by name and print its full path to stdout.
             show_resource (string): If not ``None``, then instead of processing, resolve \
                 given resource by name and print its contents to stdout.
             list_resources (boolean): If true, then instead of processing, find all installed \
                 resource files in the search paths and print their path names.
             show_help (boolean): If true, then instead of processing, print a usage description \
                 including the standard CLI and all of this processor's ocrd-tool parameters and \
                 docstrings.
             subcommand (string): 'worker' or 'server', only used here for the right --help output
             show_version (boolean): If true, then instead of processing, print information on \
                 this processor's version and OCR-D version.
             dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
                 on stdout.
             dump_module_dir (boolean): If true, then instead of processing, print :py:attr:`moduledir` \
                 on stdout.
             version (string): Deprecated. If not ``None``, overrides :py:attr:`version`.

        Raises:
             Exception: if ``parameter`` does not validate against :py:attr:`ocrd_tool`.
             SystemExit: (exit code 1) if ``resolve_resource`` or ``show_resource`` \
                 names a resource that cannot be found.
        """
        # deprecated overrides: pre-seed the caches behind the respective properties
        if ocrd_tool is not None:
            deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                "use or override metadata/executable/ocrd-tool properties instead")
            self._ocrd_tool = ocrd_tool
            self._executable = ocrd_tool['executable']
        if version is not None:
            deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
                                "use or override metadata/version properties instead")
            self._version = version
        # informational modes: print and return before any processing state is set up
        # (so e.g. self.workspace and self.parameter are NOT defined afterwards)
        if dump_json:
            print(json.dumps(self.ocrd_tool, indent=True))
            return
        if dump_module_dir:
            print(self.moduledir)
            return
        if list_resources:
            for res in self.list_all_resources():
                print(res)
            return
        if resolve_resource:
            try:
                res = self.resolve_resource(resolve_resource)
                print(res)
            except ResourceNotFoundError as e:
                log = getLogger('ocrd.processor.base')
                log.critical(e.message)
                sys.exit(1)
            return
        if show_resource:
            try:
                self.show_resource(show_resource)
            except ResourceNotFoundError as e:
                log = getLogger('ocrd.processor.base')
                log.critical(e.message)
                sys.exit(1)
            return
        if show_help:
            self.show_help(subcommand=subcommand)
            return
        if show_version:
            self.show_version()
            return
        # processing mode from here on
        self.workspace = workspace
        if self.workspace:
            # FIXME deprecate setting this and calling process() over using process_workspace()
            # which uses pushd_popd(self.workspace.directory)
            # (because there is no way to do that in process() since it's an
            # overridden method. chdir is almost always an anti-pattern.)
            # remember the original directory (resolve_resource() uses it as search base)
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        # normalize "no page selection" (None or empty list) to None
        self.page_id = None if page_id == [] or page_id is None else page_id
        self.download = download_files
        if parameter is None:
            parameter = {}
        parameterValidator = ParameterValidator(self.ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter
        # workaround for deprecated#72 (deprecation does not work for subclasses):
        # wrap whatever process() the (sub)class provides on this instance
        setattr(self, 'process',
                deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
223
224
    def show_help(self, subcommand=None):
        """
        Print the help text generated for this processor's :py:attr:`ocrd_tool`
        (passing ``subcommand`` on to the help generator) to stdout.
        """
        help_text = generate_processor_help(
            self.ocrd_tool,
            processor_instance=self,
            subcommand=subcommand)
        print(help_text)
226
227
    def show_version(self):
        """
        Print this processor's :py:attr:`version` and the OCR-D core version to stdout.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
229
230
    def verify(self) -> bool:
        """
        Verify that the :py:attr:`input_file_grp` fulfills the processor's requirements.

        The base implementation performs no checks and always returns ``True``.
        """
        return True
235
236
    def setup(self) -> None:
237
        """
238
        Prepare the processor for actual data processing,
239
        prior to changing to the workspace directory but
240
        after parsing parameters.
241
242
        (Override this to load models into memory etc.)
243
        """
244
        pass
245
246
    @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')
    def process(self) -> None:
        """
        Process all files of the :py:attr:`workspace`
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id` (or all pages)
        under the given :py:attr:`parameter`.

        (This contains the main functionality and needs to be overridden by subclasses.)

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError()
258
259
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id` (or all pages)
        under the given :py:attr:`parameter`.

        Runs inside the workspace directory, sets :py:attr:`workspace`,
        iterates over the page-wise input file tuples (downloading files
        if :py:attr:`download` is set) and calls :py:meth:`process_page_file`
        for each of them. If a ``NotImplementedError`` surfaces, falls back to
        the deprecated :py:meth:`process`.
        """
        # NOTE: input/output fileGrp sanity checks are (still) disabled here
        log = getLogger('ocrd.processor.base')
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            try:
                # FIXME: add page parallelization by running multiprocessing.Pool (#322)
                for page_files in self.zip_input_files(on_error='abort'):
                    # FIXME: add error handling by catching exceptions in various ways (#579)
                    # for example:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    # start from the files as listed; downloaded ones replace them below
                    resolved_files = list(page_files)
                    for idx, ocrd_file in enumerate(page_files):
                        if idx == 0:
                            log.info("processing page %s", ocrd_file.pageId)
                        elif ocrd_file is None:
                            # file/page not found in this file grp
                            continue
                        if self.download:
                            try:
                                resolved_files[idx] = self.workspace.download_file(ocrd_file)
                            except ValueError as err:
                                log.error(repr(err))
                                log.warning("skipping file %s for page %s", ocrd_file, ocrd_file.pageId)
                    self.process_page_file(*resolved_files)
            except NotImplementedError:
                # fall back to deprecated method
                self.process()
305
306
    def process_page_file(self, *input_files) -> None:
        """
        Process the given ``input_files`` of the :py:attr:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`~ocrd_models.OcrdFile` per input fileGrp)
        under the given :py:attr:`parameter`, and make sure the
        results get added accordingly.

        Entries may be ``None`` for input fileGrps which do not contain
        the page (this is how :py:meth:`process_workspace` passes them);
        for these, ``None`` is passed on to :py:meth:`process_page_pcgts`.
        The first entry must not be ``None`` (it determines the page ID
        and the output file ID).

        (This uses process_page_pcgts, but can be overridden by subclasses
        to handle cases like multiple fileGrps, non-PAGE input etc.)
        """
        log = getLogger('ocrd.processor.base')
        input_pcgts = [None] * len(input_files)
        page_id = input_files[0].pageId
        for i, input_file in enumerate(input_files):
            if input_file is None:
                # page not available in this fileGrp: keep the None placeholder
                # instead of failing on input_file.ID below
                log.debug("no input file for page %s at input position %d", page_id, i)
                continue
            # FIXME: what about non-PAGE input like image or JSON ???
            log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId)
            try:
                input_pcgts[i] = page_from_file(input_file)
            except ValueError as e:
                log.info("non-PAGE input for page %s: %s", page_id, e)
        output_file_id = make_file_id(input_files[0], self.output_file_grp)
        output_pcgts = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id)
        if isinstance(output_pcgts, (list, tuple)):
            # first element is the PAGE result, the rest are
            # (image, file ID, file path) triples of derived images
            output_images = output_pcgts[1:]
            output_pcgts = output_pcgts[0]
            for output_image_pil, output_image_id, output_image_path in output_images:
                self.workspace.save_image_file(
                    output_image_pil,
                    output_image_id,
                    self.output_file_grp,
                    page_id=page_id,
                    file_path=output_image_path)
        output_pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(output_pcgts)
        # FIXME: what about non-PAGE output like JSON ???
        self.workspace.add_file(file_id=output_file_id,
                                file_grp=self.output_file_grp,
                                page_id=page_id,
                                local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
                                mimetype=MIMETYPE_PAGE,
                                content=to_xml(output_pcgts))
348
349
    def process_page_pcgts(self, *input_pcgts, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage:
        """
        Process the given ``input_pcgts`` of the :py:attr:`workspace`,
        representing one physical page (passed as one parsed
        :py:class:`~ocrd_models.OcrdPage` per input fileGrp)
        under the given :py:attr:`parameter`, and return the
        resulting :py:class:`~ocrd_models.OcrdPage`.

        Optionally, return a list or tuple of the :py:class:`~ocrd_models.OcrdPage`
        and one or more lists or tuples of :py:class:`PIL.Image` (image data),
        :py:class:`str` (file ID) and :py:class:`str` (file path) of derived images
        to be annotated along with the resulting PAGE file.

        (This contains the main functionality and must be overridden by subclasses.)

        Keyword Args:
             output_file_id (string): file ID the result will be stored under
             page_id (string): physical page ID of the page being processed

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError()
365
366
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Append a ``processingStep`` :py:class:`~ocrd_models.ocrd_page.MetadataItemType`
        to the metadata of ``pcgts``, recording the first step and the executable of
        :py:attr:`ocrd_tool`, all runtime parameters, and the versions of this
        processor and of ocrd/core.
        """
        metadata = pcgts.get_Metadata()
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        # one label per runtime parameter
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=self.parameter[key])
                   for key in self.parameter.keys()])
        # versions of the processor itself and of the core library
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=self.ocrd_tool['executable'], value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        item = MetadataItemType(type_="processingStep",
                                name=step_name,
                                value=executable,
                                Labels=[parameter_labels, version_labels])
        metadata.add_MetadataItem(item)
389
390
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        If ``val`` exists as a path, it is returned as is. Otherwise the first
        existing resource candidate is returned (searching relative to the
        directory remembered before entering the workspace, if any, else the
        current working directory).

        Args:
            val (string): resource value to resolve
        Raises:
            ResourceNotFoundError: if no existing candidate was found
        """
        initLogging()
        executable = self.ocrd_tool['executable']
        log = getLogger('ocrd.processor.base')
        if exists(val):
            log.debug("Resolved to absolute path %s" % val)
            return val
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=cwd, moduled=self.moduledir)
        existing = [path for path in candidates if exists(path)]
        if not existing:
            raise ResourceNotFoundError(val, executable)
        log.debug("Resolved %s to absolute path %s" % (val, existing[0]))
        return existing[0]
415
416
    def show_resource(self, val):
        """
        Resolve the resource ``val`` and write its contents to stdout (binary):
        a regular file is written as is, a directory is written as a
        gzip-compressed tarball of its contents.
        """
        resource_path = Path(self.resolve_resource(val))
        if not resource_path.is_dir():
            sys.stdout.buffer.write(resource_path.read_bytes())
            return
        # archive from inside the directory, so member names are relative to it
        with pushd_popd(resource_path):
            archive = io.BytesIO()
            with tarfile.open(fileobj=archive, mode='w:gz') as tarball:
                tarball.add('.')
            archive.seek(0)
            copyfileobj(archive, sys.stdout.buffer)
428
429
    def list_all_resources(self):
        """
        Yield (as :py:class:`pathlib.Path`) all resources found in the filesystem
        for this processor, filtered by content-type via filename suffix
        unless the processor accepts ``*/*``.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        accept_anything = '*/*' in mimetypes
        accept_directories = 'text/directory' in mimetypes
        for candidate in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            path = Path(candidate)
            if not accept_anything:
                if path.is_dir() and not accept_directories:
                    continue
                # if we do not know all MIME types, then keep the file, otherwise require suffix match
                # (an unknown MIME type maps to the file's own suffix, i.e. always matches)
                if path.is_file() and all(path.suffix != MIME_TO_EXT.get(mime, path.suffix)
                                          for mime in mimetypes):
                    continue
            yield path
444
445
    @property
446
    def module(self):
447
        """
448
        The top-level module this processor belongs to.
449
        """
450
        # find shortest prefix path that is not just a namespace package
451
        fqname = ''
452
        for name in self.__module__.split('.'):
453
            if fqname:
454
                fqname += '.'
455
            fqname += name
456
            if getattr(sys.modules[fqname], '__file__', None):
457
                return fqname
458
        # fall-back
459
        return self.__module__
460
461
    @property
    def moduledir(self):
        """
        The filesystem path of the module directory
        (the resource path ``.`` of :py:attr:`module`).
        """
        top_level_module = self.module
        return resource_filename(top_level_module, '.')
467
468
    @property
469
    def input_files(self):
470
        """
471
        List the input files (for single-valued :py:attr:`input_file_grp`).
472
473
        For each physical page:
474
475
        - If there is a single PAGE-XML for the page, take it (and forget about all
476
          other files for that page)
477
        - Else if there is a single image file, take it (and forget about all other
478
          files for that page)
479
        - Otherwise raise an error (complaining that only PAGE-XML warrants
480
          having multiple images for a single page)
481
        Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>_
482
        
483
        Returns:
484
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
485
        """
486
        if not self.input_file_grp:
487
            raise ValueError("Processor is missing input fileGrp")
488
        ret = self.zip_input_files(mimetype=None, on_error='abort')
489
        if not ret:
490
            return []
491
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
492
        return [tuples[0] for tuples in ret]
493
494
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
495
        """
496
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).
497
498
        Processors that expect/need multiple input file groups,
499
        cannot use :py:data:`input_files`. They must align (zip) input files
500
        across pages. This includes the case where not all pages
501
        are equally present in all file groups. It also requires
502
        making a consistent selection if there are multiple files
503
        per page.
504
505
        Following the OCR-D functional model, this function tries to
506
        find a single PAGE file per page, or fall back to a single
507
        image file per page. In either case, multiple matches per page
508
        are an error (see error handling below).
509
        This default behaviour can be changed by using a fixed MIME
510
        type filter via :py:attr:`mimetype`. But still, multiple matching
511
        files per page are an error.
512
513
        Single-page multiple-file errors are handled according to
514
        :py:attr:`on_error`:
515
516
        - if ``skip``, then the page for the respective fileGrp will be
517
          silently skipped (as if there was no match at all)
518
        - if ``first``, then the first matching file for the page will be
519
          silently selected (as if the first was the only match)
520
        - if ``last``, then the last matching file for the page will be
521
          silently selected (as if the last was the only match)
522
        - if ``abort``, then an exception will be raised.
523
        Multiple matches for PAGE-XML will always raise an exception.
524
525
        Keyword Args:
526
             require_first (boolean): If true, then skip a page entirely
527
                 whenever it is not available in the first input `fileGrp`.
528
             mimetype (string): If not `None`, filter by the specified MIME
529
                 type (literal or regex prefixed by `//`). Otherwise prefer
530
                 PAGE or image.
531
        Returns:
532
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
533
        """
534
        if not self.input_file_grp:
535
            raise ValueError("Processor is missing input fileGrp")
536
537
        LOG = getLogger('ocrd.processor.base')
538
        ifgs = self.input_file_grp.split(",")
539
        # Iterating over all files repeatedly may seem inefficient at first sight,
540
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
541
        # can actually be much more costly than traversing the ltree.
542
        # This might depend on the number of pages vs number of fileGrps.
543
544
        pages = dict()
545
        for i, ifg in enumerate(ifgs):
546
            files_ = sorted(self.workspace.mets.find_all_files(
547
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
548
                                # sort by MIME type so PAGE comes before images
549
                                key=lambda file_: file_.mimetype)
550
            # Warn if no files found but pageId was specified because that
551
            # might be because of invalid page_id (range)
552
            if self.page_id and not files_:
553
                msg = (f"Could not find any files for --page-id {self.page_id} - "
554
                       f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.")
555
                if on_error == 'abort':
556
                    raise ValueError(msg)
557
                LOG.warning(msg)
558
            for file_ in files_:
559
                if not file_.pageId:
560
                    continue
561
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
562
                if ift[i]:
563
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
564
                    # fileGrp has multiple files for this page ID
565
                    if mimetype:
566
                        # filter was active, this must not happen
567 View Code Duplication
                        if on_error == 'skip':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
568
                            ift[i] = None
569
                        elif on_error == 'first':
570
                            pass # keep first match
571
                        elif on_error == 'last':
572
                            ift[i] = file_
573
                        elif on_error == 'abort':
574
                            raise ValueError(
575
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
576
                                    mimetype, file_.pageId, ifg))
577
                        else:
578
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
579
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
580
                          file_.mimetype != MIMETYPE_PAGE):
581
                        pass # keep PAGE match
582
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
583
                          file_.mimetype == MIMETYPE_PAGE):
584
                        raise ValueError(
585
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
586
                                file_.pageId, ifg))
587
                    else:
588
                        # filter was inactive but no PAGE is in control, this must not happen
589 View Code Duplication
                        if on_error == 'skip':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
590
                            ift[i] = None
591
                        elif on_error == 'first':
592
                            pass # keep first match
593
                        elif on_error == 'last':
594
                            ift[i] = file_
595
                        elif on_error == 'abort':
596
                            raise ValueError(
597
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
598
                                    file_.pageId, ifg))
599
                        else:
600
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
601
                else:
602
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
603
                    ift[i] = file_
604
        ifts = list()
605
        for page, ifiles in pages.items():
606
            for i, ifg in enumerate(ifgs):
607
                if not ifiles[i]:
608
                    # other fallback options?
609
                    LOG.error('found no page %s in file group %s',
610
                              page, ifg)
611
            if ifiles[0] or not require_first:
612
                ifts.append(tuple(ifiles))
613
        return ifts
614