Passed
Pull Request — master (#1240)
by Konstantin
02:57
created

ocrd.processor.base.Processor.moduledir()   A

Complexity

Conditions 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""
2
Processor base class and helper functions.
3
"""
4
5
# Public API of this module; the helper functions are re-exported for backwards compatibility.
__all__ = ['Processor', 'generate_processor_help', 'run_cli', 'run_processor']
11
12
from os.path import exists, join
13
from shutil import copyfileobj
14
import json
15
import os
16
from os import getcwd
17
from pathlib import Path
18
from typing import List, Optional, Union
19
import sys
20
import inspect
21
import tarfile
22
import io
23
from deprecated import deprecated
24
25
from ocrd.workspace import Workspace
26
from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile
27
from ocrd.processor.ocrd_page_result import OcrdPageResult
28
from ocrd_models.ocrd_page_generateds import PcGtsType
29
from ocrd_utils import (
30
    VERSION as OCRD_VERSION,
31
    MIMETYPE_PAGE,
32
    MIME_TO_EXT,
33
    getLogger,
34
    initLogging,
35
    list_resource_candidates,
36
    pushd_popd,
37
    list_all_resources,
38
    get_processor_resource_types,
39
    resource_filename,
40
    resource_string,
41
    make_file_id,
42
    deprecation_warning
43
)
44
from ocrd_validators import ParameterValidator
45
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml
46
from ocrd_modelfactory import page_from_file
47
48
# XXX imports must remain for backwards-compatibility
49
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
50
51
class ResourceNotFoundError(FileNotFoundError):
    """
    Raised when a requested processor resource cannot be resolved.

    Attributes:
        name: the resource name that was requested
        executable: the executable the resource was requested for
        message: the error text, including a hint how to download the resource
    """
    def __init__(self, name, executable):
        self.name = name
        self.executable = executable
        self.message = (
            f"Could not find resource '{name}' for executable '{executable}'. "
            f"Try 'ocrd resmgr download {executable} {name}' to download this resource."
        )
        super().__init__(self.message)
63
64
class Processor():
65
    """
66
    A processor is a tool that implements the uniform OCR-D command-line interface
67
    for run-time data processing. That is, it executes a single workflow step,
68
    or a combination of workflow steps, on the workspace (represented by local METS).
69
    It reads input files for all or requested physical pages of the input fileGrp(s),
70
    and writes output files for them into the output fileGrp(s). It may take 
71
    a number of optional or mandatory parameters.
72
    """
73
74
    @property
75
    def metadata(self):
76
        """the ocrd-tool.json dict of the package"""
77
        if hasattr(self, '_metadata'):
78
            return self._metadata
79
        self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json'))
80
        return self._metadata
81
82
    @property
83
    def version(self):
84
        """the version of the package"""
85
        if hasattr(self, '_version'):
86
            return self._version
87
        self._version = self.metadata['version']
88
        return self._version
89
90
    @property
91
    def executable(self):
92
        """the executable name of this processor tool"""
93
        if hasattr(self, '_executable'):
94
            return self._executable
95
        self._executable = os.path.basename(inspect.stack()[-1].filename)
96
        return self._executable
97
98
    @property
99
    def ocrd_tool(self):
100
        """the ocrd-tool.json dict of this processor tool"""
101
        if hasattr(self, '_ocrd_tool'):
102
            return self._ocrd_tool
103
        self._ocrd_tool = self.metadata['tools'][self.executable]
104
        return self._ocrd_tool
105
106
    def __init__(
            self,
            # FIXME: deprecate in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=True,
            version=None
    ):
        """
        Instantiate, but do not process: store the given settings and
        validate the parameters against :py:attr:`ocrd_tool`.

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs
                 to be set before processing. If given, the current directory is
                 remembered as ``old_pwd`` and changed to the workspace directory.
        Keyword Args:
             ocrd_tool (dict): Deprecated. If given, stored as ``_ocrd_tool``, and
                 its ``executable`` entry is stored as ``_executable``.
             parameter (dict): runtime choices for the ocrd-tool ``parameters``
                 (an empty dict is used if ``None``); checked by
                 :py:class:`ParameterValidator`.
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs
                 to be set before processing.
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
                 Deprecated since version 3.0: Should be ``None`` here, but then needs
                 to be set before processing.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process
                 (or empty for all pages).
                 Deprecated since version 3.0: Should be ``None`` here, but then needs
                 to be set before processing.
             download_files (boolean): Whether input files will be downloaded prior to processing.
             version (string): Deprecated. If given, stored as ``_version``.
        Raises:
             ValueError: if the validation report for ``parameter`` is not valid.
        """
        # -- deprecated ways of passing tool description and version
        if ocrd_tool is not None:
            deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
                                "use or override metadata/executable/ocrd-tool properties instead")
            self._ocrd_tool = ocrd_tool
            self._executable = ocrd_tool['executable']
        if version is not None:
            deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
                                "use or override metadata/version properties instead")
            self._version = version

        # -- deprecated ways of passing what should go to process_workspace
        if workspace is not None:
            deprecation_warning("Passing a workspace argument other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.workspace = workspace
            # remember where we came from before entering the workspace
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        if input_file_grp is not None:
            deprecation_warning("Passing an input_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.input_file_grp = input_file_grp
        if output_file_grp is not None:
            deprecation_warning("Passing an output_file_grp kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            self.output_file_grp = output_file_grp
        if page_id is not None:
            deprecation_warning("Passing a page_id kwarg other than 'None' to Processor "
                                "is deprecated - pass as argument to process_workspace instead")
            # an empty page_id is normalized to None
            self.page_id = page_id or None

        self.download = download_files

        # -- parameter validation
        if parameter is None:
            parameter = {}
        report = ParameterValidator(self.ocrd_tool).validate(parameter)
        if not report.is_valid:
            raise ValueError("Invalid parameters %s" % report.errors)
        self.parameter = parameter

        # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
        mark_deprecated = deprecated(
            version='3.0',
            reason='process() should be replaced with process_page() and process_workspace()')
        self.process = mark_deprecated(self.process)
182
183
    def show_help(self, subcommand=None):
        """
        Print the usage description generated by ``generate_processor_help``
        for this processor's :py:attr:`ocrd_tool`, this instance, and the
        given ``subcommand``.
        """
        usage = generate_processor_help(self.ocrd_tool, processor_instance=self, subcommand=subcommand)
        print(usage)
189
190
    def show_version(self):
        """
        Print this processor's :py:attr:`version` together with the OCR-D core version.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
195
196
    def verify(self):
197
        """
198
        Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.
199
        """
200
        assert self.input_file_grp is not None
201
        assert self.output_file_grp is not None
202
        input_file_grps = self.input_file_grp.split(',')
203
        output_file_grps = self.output_file_grp.split(',')
204
        def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
205
            if isinstance(spec, int):
206
                if spec > 0:
207
                    assert len(grps) == spec, msg % (len(grps), str(spec))
208
            else:
209
                assert isinstance(spec, list)
210
                minimum = spec[0]
211
                maximum = spec[1]
212
                if minimum > 0:
213
                    assert len(grps) >= minimum, msg % (len(grps), str(spec))
214
                if maximum > 0:
215
                    assert len(grps) <= maximum, msg % (len(grps), str(spec))
216
        # FIXME: maybe we should enforce the cardinality properties to be specified or apply default=1 here
217
        # (but we already have ocrd-tool validation, and these first need to be adopted by implementors)
218
        if 'input_file_grp_cardinality' in self.ocrd_tool:
219
            assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
220
                                        "Unexpected number of input file groups %d vs %s")
221
        if 'output_file_grp_cardinality' in self.ocrd_tool:
222
            assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
223
                                        "Unexpected number of output file groups %d vs %s")
224
        for input_file_grp in input_file_grps:
225
            assert input_file_grp in self.workspace.mets.file_groups
226
        # keep this for backwards compatibility:
227
        return True
228
229
    def dump_json(self):
230
        """
231
        Print :py:attr:`ocrd_tool` on stdout.
232
        """
233
        print(json.dumps(self.ocrd_tool, indent=True))
234
        return
235
236
    def dump_module_dir(self):
237
        """
238
        Print :py:attr:`moduledir` on stdout.
239
        """
240
        print(self.moduledir)
241
        return
242
243
    def list_resources(self):
244
        """
245
        Find all installed resource files in the search paths and print their path names.
246
        """
247
        for res in self.list_all_resources():
248
            print(res)
249
        return
250
251
    def setup(self) -> None:
252
        """
253
        Prepare the processor for actual data processing,
254
        prior to changing to the workspace directory but
255
        after parsing parameters.
256
257
        (Override this to load models into memory etc.)
258
        """
259
        pass
260
261
    @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')
    def process(self) -> None:
        """
        Process all files of the :py:attr:`workspace`
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id` (or all pages)
        under the given :py:attr:`parameter`.

        Deprecated entry point: the base implementation only raises
        :py:class:`NotImplementedError`, so subclasses still relying on it
        have to override it.
        """
        raise NotImplementedError
273
274
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id` (or all pages)
        under the given :py:attr:`parameter`.

        Runs inside the workspace directory: stores ``workspace`` on the
        instance, calls :py:meth:`verify`, then walks the file tuples of
        :py:meth:`zip_input_files`, downloading each file if
        :py:attr:`download` is set, and hands every tuple to
        :py:meth:`process_page_file`. If that raises
        :py:class:`NotImplementedError`, the deprecated :py:meth:`process`
        is called instead.
        """
        log = getLogger('ocrd.processor.base')
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            self.verify()
            try:
                # FIXME: add page parallelization by running multiprocessing.Pool (#322)
                for page_files in self.zip_input_files(on_error='abort'):
                    # FIXME: add error handling by catching exceptions in various ways (#579)
                    # for example:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    fetched : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(page_files)
                    for pos, page_file in enumerate(page_files):
                        if pos == 0:
                            log.info("processing page %s", page_file.pageId)
                        elif page_file is None:
                            # file/page not found in this file grp
                            continue
                        # keep the original entry unless a download replaces it
                        fetched[pos] = page_file
                        if self.download:
                            try:
                                fetched[pos] = self.workspace.download_file(page_file)
                            except ValueError as e:
                                log.error(repr(e))
                                log.warning("skipping file %s for page %s", page_file, page_file.pageId)
                    self.process_page_file(*fetched)
            except NotImplementedError:
                # fall back to deprecated method
                self.process()
316
317
    def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None:
        """
        Process the given ``input_files`` of the :py:attr:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`~ocrd_models.OcrdFile` per input fileGrp)
        under the given :py:attr:`parameter`, and make sure the
        results get added accordingly.

        The first file must be present, because it determines the page ID
        and the output file ID. Any further entry may be ``None`` (no file
        for this page in that fileGrp); its slot is then passed on as
        ``None`` to :py:meth:`process_page_pcgts`, just like the slot of a
        file for which ``page_from_file`` raises :py:class:`ValueError`.

        All images of the result are saved into the output fileGrp, and the
        resulting PAGE gets its ID and metadata set before being added to
        the workspace as XML.

        (This uses process_page_pcgts, but can be overridden by subclasses
        to handle cases like multiple fileGrps, non-PAGE input etc.)
        """
        log = getLogger('ocrd.processor.base')
        input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
        assert isinstance(input_files[0], (OcrdFile, ClientSideOcrdFile))
        page_id = input_files[0].pageId
        for i, input_file in enumerate(input_files):
            if input_file is None:
                # page is missing in this fileGrp: nothing to parse, keep the empty slot
                continue
            assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile))
            log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId)
            try:
                page_ = page_from_file(input_file)
                assert isinstance(page_, PcGtsType)
                input_pcgts[i] = page_
            except ValueError as e:
                log.info("non-PAGE input for page %s: %s", page_id, e)
        output_file_id = make_file_id(input_files[0], self.output_file_grp)
        result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
        for image_result in result.images:
            image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
            image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
            # make the PAGE reference point to where the image is about to be saved
            image_result.alternative_image.set_filename(image_file_path)
            self.workspace.save_image_file(
                image_result.pil,
                image_file_id,
                self.output_file_grp,
                page_id=page_id,
                file_path=image_file_path)
        result.pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(result.pcgts)
        # FIXME: what about non-PAGE output like JSON ???
        self.workspace.add_file(file_id=output_file_id,
                                file_grp=self.output_file_grp,
                                page_id=page_id,
                                local_filename=join(self.output_file_grp, output_file_id + '.xml'),
                                mimetype=MIMETYPE_PAGE,
                                content=to_xml(result.pcgts))
362
363
    def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
364
        """
365
        Process the given ``input_pcgts`` of the :py:attr:`workspace`,
366
        representing one physical page (passed as one parsed
367
        :py:class:`~ocrd_models.OcrdPage` per input fileGrp)
368
        under the given :py:attr:`parameter`, and return the
369
        resulting :py:class:`~ocrd.processor.OcrdPageResult`.
370
371
        Optionally, add to the ``images`` attribute of the resulting
372
        :py:class:`~ocrd.processor.OcrdPageResult` instances
373
        of :py:class:`~ocrd.processor.OcrdPageResultImage`,
374
        which have required fields for ``pil`` (:py:class:`PIL.Image` image data),
375
        ``file_id_suffix`` (used for generating IDs of the saved image) and
376
        ``alternative_image`` (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType`
377
        for setting the filename of the saved image).
378
379
        (This contains the main functionality and must be overridden by subclasses.)
380
        """
381
        raise NotImplementedError()
382
383
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
        the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.

        The item is named after the first entry of ``steps`` in
        :py:attr:`ocrd_tool`, valued with its ``executable``, and carries two
        label groups: one label per entry of :py:attr:`parameter`, and the
        versions of this processor and of OCR-D core.
        """
        metadata_obj = pcgts.get_Metadata()
        assert metadata_obj is not None
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=name, value=value)
                   for name, value in self.parameter.items()])
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=self.ocrd_tool['executable'], value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        item = MetadataItemType(type_="processingStep",
                                name=step_name,
                                value=executable,
                                Labels=[parameter_labels, version_labels])
        metadata_obj.add_MetadataItem(item)
408
409
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        If ``val`` itself exists as a path, it is returned unchanged.
        Otherwise the first existing candidate from
        ``list_resource_candidates`` is returned, searching relative to
        ``old_pwd`` (if set on the instance, else the current directory)
        and :py:attr:`moduledir`.

        Args:
            val (string): resource value to resolve
        Raises:
            ResourceNotFoundError: if neither ``val`` nor any candidate exists.
        """
        initLogging()
        executable = self.ocrd_tool['executable']
        log = getLogger('ocrd.processor.base')
        if exists(val):
            log.debug("Resolved to absolute path %s" % val)
            return val
        # prefer the directory we were started from over the current one
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=cwd, moduled=self.moduledir)
        for candidate in candidates:
            if exists(candidate):
                log.debug("Resolved %s to absolute path %s" % (val, candidate))
                return candidate
        raise ResourceNotFoundError(val, executable)
434
435
    def show_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters,
        then print its contents to stdout.

        A regular file is written as raw bytes; a directory is written as a
        gzip-compressed tarball of its contents.

        Args:
            val (string): resource value to show
        """
        res_path = Path(self.resolve_resource(val))
        if not res_path.is_dir():
            sys.stdout.buffer.write(res_path.read_bytes())
            return
        # archive from inside the directory, so member names are relative to it
        with pushd_popd(res_path):
            archive = io.BytesIO()
            with tarfile.open(fileobj=archive, mode='w:gz') as tarball:
                tarball.add('.')
            archive.seek(0)
            copyfileobj(archive, sys.stdout.buffer)
456
457
    def list_all_resources(self):
        """
        List all resources found in the filesystem and matching content-type by filename suffix.

        Yields each resource as a :py:class:`pathlib.Path`. If the accepted
        MIME types include ``*/*``, everything is kept. Otherwise directories
        are dropped unless ``text/directory`` is accepted, and files are
        dropped unless their suffix fits one of the accepted MIME types.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        for found in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            res = Path(found)
            if '*/*' in mimetypes:
                yield res
                continue
            if res.is_dir() and 'text/directory' not in mimetypes:
                continue
            # if we do not know all MIME types, then keep the file, otherwise require suffix match
            # (a MIME type without known extension compares the suffix against itself)
            if res.is_file() and all(res.suffix != MIME_TO_EXT.get(mime, res.suffix)
                                     for mime in mimetypes):
                continue
            yield res
472
473
    @property
474
    def module(self):
475
        """
476
        The top-level module this processor belongs to.
477
        """
478
        # find shortest prefix path that is not just a namespace package
479
        fqname = ''
480
        for name in self.__module__.split('.'):
481
            if fqname:
482
                fqname += '.'
483
            fqname += name
484
            if getattr(sys.modules[fqname], '__file__', None):
485
                return fqname
486
        # fall-back
487
        return self.__module__
488
489
    @property
    def moduledir(self):
        """
        The filesystem path of the module directory, i.e. the location
        ``resource_filename`` reports for ``.`` within :py:attr:`module`.
        """
        package = self.module
        return resource_filename(package, '.')
495
496
    @property
497
    def input_files(self):
498
        """
499
        List the input files (for single-valued :py:attr:`input_file_grp`).
500
501
        For each physical page:
502
503
        - If there is a single PAGE-XML for the page, take it (and forget about all
504
          other files for that page)
505
        - Else if there is a single image file, take it (and forget about all other
506
          files for that page)
507
        - Otherwise raise an error (complaining that only PAGE-XML warrants
508
          having multiple images for a single page)
509
        Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>_
510
511
        Returns:
512
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
513
        """
514
        if not self.input_file_grp:
515
            raise ValueError("Processor is missing input fileGrp")
516
        ret = self.zip_input_files(mimetype=None, on_error='abort')
517
        if not ret:
518
            return []
519
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
520
        return [tuples[0] for tuples in ret]
521
522
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
523
        """
524
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).
525
526
        Processors that expect/need multiple input file groups,
527
        cannot use :py:data:`input_files`. They must align (zip) input files
528
        across pages. This includes the case where not all pages
529
        are equally present in all file groups. It also requires
530
        making a consistent selection if there are multiple files
531
        per page.
532
533
        Following the OCR-D functional model, this function tries to
534
        find a single PAGE file per page, or fall back to a single
535
        image file per page. In either case, multiple matches per page
536
        are an error (see error handling below).
537
        This default behaviour can be changed by using a fixed MIME
538
        type filter via :py:attr:`mimetype`. But still, multiple matching
539
        files per page are an error.
540
541
        Single-page multiple-file errors are handled according to
542
        :py:attr:`on_error`:
543
544
        - if ``skip``, then the page for the respective fileGrp will be
545
          silently skipped (as if there was no match at all)
546
        - if ``first``, then the first matching file for the page will be
547
          silently selected (as if the first was the only match)
548
        - if ``last``, then the last matching file for the page will be
549
          silently selected (as if the last was the only match)
550
        - if ``abort``, then an exception will be raised.
551
        Multiple matches for PAGE-XML will always raise an exception.
552
553
        Keyword Args:
554
             require_first (boolean): If true, then skip a page entirely
555
                 whenever it is not available in the first input `fileGrp`.
556
             mimetype (string): If not `None`, filter by the specified MIME
557
                 type (literal or regex prefixed by `//`). Otherwise prefer
558
                 PAGE or image.
559
        Returns:
560
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
561
        """
562
        if not self.input_file_grp:
563
            raise ValueError("Processor is missing input fileGrp")
564
565
        LOG = getLogger('ocrd.processor.base')
566
        ifgs = self.input_file_grp.split(",")
567
        # Iterating over all files repeatedly may seem inefficient at first sight,
568
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
569
        # can actually be much more costly than traversing the ltree.
570
        # This might depend on the number of pages vs number of fileGrps.
571
572
        pages = dict()
573
        for i, ifg in enumerate(ifgs):
574
            files_ = sorted(self.workspace.mets.find_all_files(
575
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
576
                                # sort by MIME type so PAGE comes before images
577
                                key=lambda file_: file_.mimetype)
578
            # Warn if no files found but pageId was specified because that
579
            # might be because of invalid page_id (range)
580
            if self.page_id and not files_:
581
                msg = (f"Could not find any files for --page-id {self.page_id} - "
582
                       f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.")
583
                if on_error == 'abort':
584
                    raise ValueError(msg)
585
                LOG.warning(msg)
586
            for file_ in files_:
587
                if not file_.pageId:
588
                    continue
589
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
590
                if ift[i]:
591
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
592
                    # fileGrp has multiple files for this page ID
593
                    if mimetype:
594
                        # filter was active, this must not happen
595 View Code Duplication
                        if on_error == 'skip':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
596
                            ift[i] = None
597
                        elif on_error == 'first':
598
                            pass # keep first match
599
                        elif on_error == 'last':
600
                            ift[i] = file_
601
                        elif on_error == 'abort':
602
                            raise ValueError(
603
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
604
                                    mimetype, file_.pageId, ifg))
605
                        else:
606
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
607
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
608
                          file_.mimetype != MIMETYPE_PAGE):
609
                        pass # keep PAGE match
610
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
611
                          file_.mimetype == MIMETYPE_PAGE):
612
                        raise ValueError(
613
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
614
                                file_.pageId, ifg))
615
                    else:
616
                        # filter was inactive but no PAGE is in control, this must not happen
617 View Code Duplication
                        if on_error == 'skip':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
618
                            ift[i] = None
619
                        elif on_error == 'first':
620
                            pass # keep first match
621
                        elif on_error == 'last':
622
                            ift[i] = file_
623
                        elif on_error == 'abort':
624
                            raise ValueError(
625
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
626
                                    file_.pageId, ifg))
627
                        else:
628
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
629
                else:
630
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
631
                    ift[i] = file_
632
        ifts = list()
633
        for page, ifiles in pages.items():
634
            for i, ifg in enumerate(ifgs):
635
                if not ifiles[i]:
636
                    # other fallback options?
637
                    LOG.error('found no page %s in file group %s',
638
                              page, ifg)
639
            if ifiles[0] or not require_first:
640
                ifts.append(tuple(ifiles))
641
        return ifts
642