Passed
Pull Request — master (#1240)
by
unknown
02:23
created

ocrd.processor.base.Processor.ocrd_tool()   A

Complexity

Conditions 2

Size

Total Lines 7
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 6
dl 0
loc 7
rs 10
c 0
b 0
f 0
cc 2
nop 1
1
"""
2
Processor base class and helper functions.
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from os.path import exists, join
13
from shutil import copyfileobj
14
import json
15
import os
16
from os import getcwd
17
from pathlib import Path
18
from typing import List, Optional, Union
19
import sys
20
import inspect
21
import tarfile
22
import io
23
from deprecated import deprecated
24
25
from ocrd.workspace import Workspace
26
from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile
27
from ocrd.processor.ocrd_page_result import OcrdPageResult
28
from ocrd_utils import (
29
    VERSION as OCRD_VERSION,
30
    MIMETYPE_PAGE,
31
    MIME_TO_EXT,
32
    getLogger,
33
    initLogging,
34
    list_resource_candidates,
35
    pushd_popd,
36
    list_all_resources,
37
    get_processor_resource_types,
38
    resource_filename,
39
    resource_string,
40
    make_file_id,
41
    deprecation_warning
42
)
43
from ocrd_validators import ParameterValidator
44
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml
45
from ocrd_modelfactory import page_from_file
46
47
# XXX imports must remain for backwards-compatibility
48
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
49
50
class ResourceNotFoundError(FileNotFoundError):
    """
    Raised when a requested processor resource cannot be resolved.

    The instance keeps the resource ``name``, the requesting ``executable``
    and the rendered ``message`` (which is also the exception text).
    """
    def __init__(self, name, executable):
        self.name, self.executable = name, executable
        template = "Could not find resource '%s' for executable '%s'. " \
                   "Try 'ocrd resmgr download %s %s' to download this resource."
        # the download hint takes the executable first, then the resource name
        self.message = template % (name, executable, executable, name)
        super().__init__(self.message)
62
63
class Processor():
64
    """
65
    A processor is a tool that implements the uniform OCR-D command-line interface
66
    for run-time data processing. That is, it executes a single workflow step,
67
    or a combination of workflow steps, on the workspace (represented by local METS).
68
    It reads input files for all or requested physical pages of the input fileGrp(s),
69
    and writes output files for them into the output fileGrp(s). It may take 
70
    a number of optional or mandatory parameters.
71
    """
72
73
    @property
74
    def metadata(self):
75
        """the ocrd-tool.json dict of the package"""
76
        if hasattr(self, '_metadata'):
77
            return self._metadata
78
        self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json'))
79
        return self._metadata
80
81
    @property
82
    def version(self):
83
        """the version of the package"""
84
        if hasattr(self, '_version'):
85
            return self._version
86
        self._version = self.metadata['version']
87
        return self._version
88
89
    @property
90
    def executable(self):
91
        """the executable name of this processor tool"""
92
        if hasattr(self, '_executable'):
93
            return self._executable
94
        self._executable = os.path.basename(inspect.stack()[-1].filename)
95
        return self._executable
96
97
    @property
98
    def ocrd_tool(self):
99
        """the ocrd-tool.json dict of this processor tool"""
100
        if hasattr(self, '_ocrd_tool'):
101
            return self._ocrd_tool
102
        self._ocrd_tool = self.metadata['tools'][self.executable]
103
        return self._ocrd_tool
104
105
    def __init__(
            self,
            # FIXME: deprecate in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=True,
            version=None
    ):
        """
        Instantiate, but do not process.

        Validates ``parameter`` against :py:attr:`ocrd_tool` and stores it.
        If a ``workspace`` is passed (deprecated), remembers the current
        working directory in ``old_pwd`` and changes into the workspace directory.

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \
                 before processing.
        Keyword Args:
             ocrd_tool (dict): ocrd-tool.json description of this tool. \
                 Deprecated: use or override the metadata/executable/ocrd_tool properties instead.
             parameter: the runtime choices for ocrd-tool ``parameters`` \
                 (``None`` is treated as an empty dict).
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. \
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \
                 before processing.
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. \
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \
                 before processing.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
                 (or empty for all pages). \
                 Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \
                 before processing.
             download_files (boolean): Whether input files will be downloaded prior to processing.
             version (string): version of the tool. \
                 Deprecated: use or override the metadata/version properties instead.

        Raises:
             ValueError: if ``parameter`` does not validate against the tool description.
        """
        if ocrd_tool is not None:
            deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                "use or override metadata/executable/ocrd-tool properties instead")
            self._ocrd_tool = ocrd_tool
            self._executable = ocrd_tool['executable']
        if version is not None:
            deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
                                "use or override metadata/version properties instead")
            self._version = version
        # Make sure the processing attributes always exist, so that later checks
        # (``is not None`` / ``if not self.input_file_grp``) report a missing value
        # instead of failing with AttributeError. Values already provided by a
        # subclass (class attribute, property, or assignment before super().__init__)
        # are left untouched.
        for attr in ('workspace', 'input_file_grp', 'output_file_grp', 'page_id'):
            if not hasattr(self, attr):
                setattr(self, attr, None)
        if workspace is not None:
            deprecation_warning("Passing a workspace argument other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.workspace = workspace
            # remembered so that relative resource paths can still be resolved later
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        if input_file_grp is not None:
            deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.input_file_grp = input_file_grp
        if output_file_grp is not None:
            deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.output_file_grp = output_file_grp
        if page_id is not None:
            deprecation_warning("Passing a page_id kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            # normalize the empty string to None ("all pages")
            self.page_id = page_id or None
        self.download = download_files
        if parameter is None:
            parameter = {}
        parameter_validator = ParameterValidator(self.ocrd_tool)

        report = parameter_validator.validate(parameter)
        if not report.is_valid:
            raise ValueError("Invalid parameters %s" % report.errors)
        self.parameter = parameter
        # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
        setattr(self, 'process',
                deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
181
182
    def show_help(self, subcommand=None):
        """
        Print the usage text generated from :py:attr:`ocrd_tool` for this
        processor instance (optionally for the given ``subcommand``).
        """
        help_text = generate_processor_help(self.ocrd_tool,
                                            processor_instance=self,
                                            subcommand=subcommand)
        print(help_text)
188
189
    def show_version(self):
        """
        Print this processor's version together with the OCR-D core version.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
194
195
    def verify(self):
196
        """
197
        Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.
198
        """
199
        assert self.input_file_grp is not None
200
        assert self.output_file_grp is not None
201
        input_file_grps = self.input_file_grp.split(',')
202
        output_file_grps = self.output_file_grp.split(',')
203
        def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
204
            if isinstance(spec, int):
205
                if spec > 0:
206
                    assert len(grps) == spec, msg % (len(grps), str(spec))
207
            else:
208
                assert isinstance(spec, list)
209
                minimum = spec[0]
210
                maximum = spec[1]
211
                if minimum > 0:
212
                    assert len(grps) >= minimum, msg % (len(grps), str(spec))
213
                if maximum > 0:
214
                    assert len(grps) <= maximum, msg % (len(grps), str(spec))
215
        # FIXME: maybe we should enforce the cardinality properties to be specified or apply default=1 here
216
        # (but we already have ocrd-tool validation, and these first need to be adopted by implementors)
217
        if 'input_file_grp_cardinality' in self.ocrd_tool:
218
            assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
219
                                        "Unexpected number of input file groups %d vs %s")
220
        if 'output_file_grp_cardinality' in self.ocrd_tool:
221
            assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
222
                                        "Unexpected number of output file groups %d vs %s")
223
        for input_file_grp in input_file_grps:
224
            assert input_file_grp in self.workspace.mets.file_groups
225
        # keep this for backwards compatibility:
226
        return True
227
228
    def dump_json(self):
229
        """
230
        Print :py:attr:`ocrd_tool` on stdout.
231
        """
232
        print(json.dumps(self.ocrd_tool, indent=True))
233
        return
234
235
    def dump_module_dir(self):
236
        """
237
        Print :py:attr:`moduledir` on stdout.
238
        """
239
        print(self.moduledir)
240
        return
241
242
    def list_resources(self):
243
        """
244
        Find all installed resource files in the search paths and print their path names.
245
        """
246
        for res in self.list_all_resources():
247
            print(res)
248
        return
249
250
    def setup(self) -> None:
251
        """
252
        Prepare the processor for actual data processing,
253
        prior to changing to the workspace directory but
254
        after parsing parameters.
255
256
        (Override this to load models into memory etc.)
257
        """
258
        pass
259
260
    @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')
    def process(self) -> None:
        """
        Deprecated entry point: process all files of the :py:attr:`workspace`
        from :py:attr:`input_file_grp` to :py:attr:`output_file_grp`
        for :py:attr:`page_id` (or all pages) under :py:attr:`parameter`.

        The base implementation always raises; legacy subclasses override
        this method with their main functionality.

        Raises:
            NotImplementedError: unless overridden.
        """
        raise NotImplementedError()
272
273
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id` (or all pages)
        under the given :py:attr:`parameter`.

        Runs inside the workspace directory: stores ``workspace`` on the
        instance, calls :py:meth:`verify`, then hands every tuple from
        :py:meth:`zip_input_files` to :py:meth:`process_page_file`
        (downloading the files first if ``self.download`` is set).
        If that raises :py:class:`NotImplementedError`, falls back to
        the deprecated :py:meth:`process`.
        """
        logger = getLogger('ocrd.processor.base')
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            self.verify()
            try:
                # FIXME: add page parallelization by running multiprocessing.Pool (#322)
                for page_files in self.zip_input_files(on_error='abort'):
                    # FIXME: add error handling by catching exceptions in various ways (#579)
                    # for example:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    resolved : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(page_files)
                    for slot, candidate in enumerate(page_files):
                        if slot == 0:
                            logger.info("processing page %s", candidate.pageId)
                        elif candidate is None:
                            # file/page not found in this file grp
                            continue
                        # keep the METS entry as-is unless downloading succeeds below
                        resolved[slot] = candidate
                        if self.download:
                            try:
                                resolved[slot] = self.workspace.download_file(candidate)
                            except ValueError as err:
                                logger.error(repr(err))
                                logger.warning("skipping file %s for page %s", candidate, candidate.pageId)
                    self.process_page_file(*resolved)
            except NotImplementedError:
                # fall back to deprecated method
                self.process()
315
316
    def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None:
        """
        Process the given ``input_files`` of the :py:attr:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`~ocrd_models.OcrdFile` per input fileGrp)
        under the given :py:attr:`parameter`, and make sure the
        results get added accordingly.

        The first file must be present; files of further fileGrps may be
        ``None`` (page not available there), in which case the corresponding
        PAGE argument passed to :py:meth:`process_page_pcgts` is ``None`` as well.
        Inputs that cannot be parsed into PAGE (``ValueError`` from
        ``page_from_file``) are likewise passed on as ``None``.

        Derived images of the result are saved into the output fileGrp, then
        the resulting PAGE is annotated via :py:meth:`add_metadata` and added
        to the workspace as PAGE-XML.

        (This uses process_page_pcgts, but can be overridden by subclasses
        to handle cases like multiple fileGrps, non-PAGE input etc.)
        """
        log = getLogger('ocrd.processor.base')
        input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
        assert isinstance(input_files[0], (OcrdFile, ClientSideOcrdFile))
        page_id = input_files[0].pageId
        for i, input_file in enumerate(input_files):
            if input_file is None:
                # process_workspace passes None for pages missing in a secondary
                # fileGrp - leave that slot empty instead of failing the assertion
                continue
            assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile))
            log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId)
            try:
                page_ = page_from_file(input_file)
                assert isinstance(page_, OcrdPage)
                input_pcgts[i] = page_
            except ValueError as e:
                # not PAGE and not an image to generate PAGE for
                log.info("non-PAGE input for page %s: %s", page_id, e)
        output_file_id = make_file_id(input_files[0], self.output_file_grp)
        result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
        for image_result in result.images:
            image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
            image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
            # the PAGE must reference the image under the path it is saved to
            image_result.alternative_image.set_filename(image_file_path)
            self.workspace.save_image_file(
                image_result.pil,
                image_file_id,
                self.output_file_grp,
                page_id=page_id,
                file_path=image_file_path)
        result.pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(result.pcgts)
        # FIXME: what about non-PAGE output like JSON ???
        self.workspace.add_file(file_id=output_file_id,
                                file_grp=self.output_file_grp,
                                page_id=page_id,
                                local_filename=join(self.output_file_grp, output_file_id + '.xml'),
                                mimetype=MIMETYPE_PAGE,
                                content=to_xml(result.pcgts))
362
363
    def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
        """
        Process the given ``input_pcgts`` of the :py:attr:`workspace`,
        representing one physical page (one parsed
        :py:class:`~ocrd_models.OcrdPage` per input fileGrp)
        under the given :py:attr:`parameter`, and return the
        resulting :py:class:`~ocrd.processor.OcrdPageResult`.

        Implementations may add :py:class:`~ocrd.processor.OcrdPageResultImage`
        instances to the ``images`` attribute of the result. Each of them needs

        - ``pil``: the :py:class:`PIL.Image` image data,
        - ``file_id_suffix``: used for generating the ID of the saved image,
        - ``alternative_image``: the :py:class:`ocrd_models.ocrd_page.AlternativeImageType`
          whose filename will be set to that of the saved image.

        The base implementation always raises.
        (This contains the main functionality and must be overridden by subclasses.)

        Raises:
            NotImplementedError: unless overridden.
        """
        raise NotImplementedError()
382
383
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Append a ``processingStep`` :py:class:`~ocrd_models.ocrd_page.MetadataItemType`
        to the ``Metadata`` of ``pcgts``.

        The item is named after the first entry of the tool's ``steps``, carries
        the executable name as value, and has two label groups: one with all
        runtime parameters, one with the tool and ocrd/core versions.
        """
        page_metadata = pcgts.get_Metadata()
        assert page_metadata is not None
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        # one label per runtime parameter
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=val)
                   for key, val in self.parameter.items()])
        # versions of this tool and of OCR-D core
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=executable, value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        item = MetadataItemType(type_="processingStep",
                                name=step_name,
                                value=executable,
                                Labels=[parameter_labels, version_labels])
        page_metadata.add_MetadataItem(item)
408
409
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        If ``val`` itself exists as a path, it is returned unchanged.
        Otherwise the first existing candidate location is returned; candidates
        are searched relative to the directory that was current before a
        workspace was entered (``old_pwd``, if set) or else the current
        directory, and relative to :py:attr:`moduledir`.

        Args:
            val (string): resource value to resolve

        Raises:
            ResourceNotFoundError: if no existing candidate is found.
        """
        initLogging()
        executable = self.ocrd_tool['executable']
        logger = getLogger('ocrd.processor.base')
        if exists(val):
            logger.debug("Resolved to absolute path %s" % val)
            return val
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=cwd, moduled=self.moduledir)
        found = [cand for cand in candidates if exists(cand)]
        if not found:
            raise ResourceNotFoundError(val, executable)
        logger.debug("Resolved %s to absolute path %s" % (val, found[0]))
        return found[0]
434
435
    def show_resource(self, val):
436
        """
437
        Resolve a resource name to a file path with the algorithm in
438
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters,
439
        then print its contents to stdout.
440
441
        Args:
442
            val (string): resource value to show
443
        """
444
445
        res_fname = self.resolve_resource(val)
446
        fpath = Path(res_fname)
447
        if fpath.is_dir():
448
            with pushd_popd(fpath):
449
                fileobj = io.BytesIO()
450
                with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball:
451
                    tarball.add('.')
452
                fileobj.seek(0)
453
                copyfileobj(fileobj, sys.stdout.buffer)
454
        else:
455
            sys.stdout.buffer.write(fpath.read_bytes())
456
457
    def list_all_resources(self):
        """
        Yield (as :py:class:`pathlib.Path`) all resources found in the filesystem
        for this executable that match the tool's resource content-types.

        Unless the accepted types contain the ``*/*`` wildcard, directories are
        only kept if ``text/directory`` is accepted, and files are only kept if
        their filename suffix fits at least one accepted MIME type.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            res = Path(res)
            if '*/*' not in mimetypes:
                if res.is_dir() and 'text/directory' not in mimetypes:
                    continue
                # a MIME type without known extension matches any suffix,
                # otherwise the suffix must equal the extension of that type
                if res.is_file() and all(res.suffix != MIME_TO_EXT.get(mime, res.suffix)
                                         for mime in mimetypes):
                    continue
            yield res
472
473
    @property
474
    def module(self):
475
        """
476
        The top-level module this processor belongs to.
477
        """
478
        # find shortest prefix path that is not just a namespace package
479
        fqname = ''
480
        for name in self.__module__.split('.'):
481
            if fqname:
482
                fqname += '.'
483
            fqname += name
484
            if getattr(sys.modules[fqname], '__file__', None):
485
                return fqname
486
        # fall-back
487
        return self.__module__
488
489
    @property
    def moduledir(self):
        """
        The filesystem path of the module directory
        (resource location ``.`` of :py:attr:`module`).
        """
        top_module = self.module
        return resource_filename(top_module, '.')
495
496
    @property
497
    def input_files(self):
498
        """
499
        List the input files (for single-valued :py:attr:`input_file_grp`).
500
501
        For each physical page:
502
503
        - If there is a single PAGE-XML for the page, take it (and forget about all
504
          other files for that page)
505
        - Else if there is a single image file, take it (and forget about all other
506
          files for that page)
507
        - Otherwise raise an error (complaining that only PAGE-XML warrants
508
          having multiple images for a single page)
509
        Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>_
510
511
        Returns:
512
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
513
        """
514
        if not self.input_file_grp:
515
            raise ValueError("Processor is missing input fileGrp")
516
        ret = self.zip_input_files(mimetype=None, on_error='abort')
517
        if not ret:
518
            return []
519
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
520
        return [tuples[0] for tuples in ret]
521
522
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via :py:attr:`mimetype`. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        :py:attr:`on_error`:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all)
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Multiple matches for PAGE-XML will always raise an exception.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
             on_error (string): How to handle multiple matching files for a
                 page in one `fileGrp`: ``skip``, ``first``, ``last`` or
                 ``abort`` (see above). With ``abort``, finding no files at
                 all for a non-empty :py:attr:`page_id` raises as well
                 (otherwise this is only logged as a warning).
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples
            (one entry per input `fileGrp`, ``None`` where nothing was selected).
        Raises:
            ValueError: if :py:attr:`input_file_grp` is empty, if there are
                multiple PAGE-XML matches for a page, if ``on_error`` is
                ``abort`` and one of the errors above occurs, or if a
                conflict occurs and ``on_error`` is not one of the known strategies.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")
        # tolerate instances on which no page selection was ever assigned
        page_id = getattr(self, 'page_id', None)

        def resolve_conflict(current, candidate, abort_message):
            """Return the file to keep when a fileGrp has two candidates for one page."""
            if on_error == 'skip':
                return None
            if on_error == 'first':
                return current
            if on_error == 'last':
                return candidate
            if on_error == 'abort':
                raise ValueError(abort_message)
            raise ValueError("Unknown 'on_error' strategy '%s'" % on_error)

        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = {}
        # (pageId, fileGrp index) slots dropped via on_error='skip': these must stay
        # empty, otherwise a third match would be picked up as if it were the first
        skipped = set()
        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                    pageId=page_id, fileGrp=ifg, mimetype=mimetype),
                            # sort by MIME type so PAGE comes before images
                            key=lambda file_: file_.mimetype)
            # Warn if no files found but pageId was specified because that
            # might be because of invalid page_id (range)
            if page_id and not files_:
                msg = (f"Could not find any files for --page-id {page_id} - "
                       f"compare '{page_id}' with the output of 'ocrd workspace list-page'.")
                if on_error == 'abort':
                    raise ValueError(msg)
                LOG.warning(msg)
            for file_ in files_:
                if not file_.pageId:
                    continue
                if (file_.pageId, i) in skipped:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if not ift[i]:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                if mimetype:
                    # filter was active, this must not happen
                    abort_message = "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                        mimetype, file_.pageId, ifg)
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    # keep PAGE match
                    continue
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    abort_message = "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                        file_.pageId, ifg)
                ift[i] = resolve_conflict(ift[i], file_, abort_message)
                if ift[i] is None:
                    skipped.add((file_.pageId, i))
        ifts = []
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
642