Passed
Pull Request — master (#1240)
by
unknown
02:52
created

ocrd.processor.base.Processor.input_files()   A

Complexity

Conditions 3

Size

Total Lines 25
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 9
dl 0
loc 25
rs 9.95
c 0
b 0
f 0
cc 3
nop 1
1
"""
2
Processor base class and helper functions.
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from os.path import exists
13
from shutil import copyfileobj
14
import json
15
import os
16
from os import getcwd
17
from pathlib import Path
18
from typing import Optional
19
import sys
20
import tarfile
21
import io
22
from deprecated import deprecated
23
24
from ocrd.workspace import Workspace
25
from ocrd_utils import (
26
    VERSION as OCRD_VERSION,
27
    MIMETYPE_PAGE,
28
    MIME_TO_EXT,
29
    getLogger,
30
    initLogging,
31
    list_resource_candidates,
32
    pushd_popd,
33
    list_all_resources,
34
    get_processor_resource_types,
35
    resource_filename,
36
    make_file_id,
37
)
38
from ocrd_validators import ParameterValidator
39
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml
40
from ocrd_modelfactory import page_from_file
41
42
# XXX imports must remain for backwards-compatibility
43
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
44
45
class ResourceNotFoundError(FileNotFoundError):
    """
    Raised when a processor resource requested by name cannot be resolved.

    The instance keeps the resource ``name``, the ``executable`` it was
    requested for, and the full human-readable ``message`` (which also
    suggests the ``ocrd resmgr download`` call that would fetch it).
    """
    def __init__(self, name, executable):
        problem = "Could not find resource '%s' for executable '%s'. " % (name, executable)
        remedy = "Try 'ocrd resmgr download %s %s' to download this resource." % (executable, name)
        self.name = name
        self.executable = executable
        self.message = problem + remedy
        super().__init__(self.message)
57
58
class Processor():
59
    """
60
    A processor is a tool that implements the uniform OCR-D command-line interface
61
    for run-time data processing. That is, it executes a single workflow step,
62
    or a combination of workflow steps, on the workspace (represented by local METS).
63
    It reads input files for all or requested physical pages of the input fileGrp(s),
64
    and writes output files for them into the output fileGrp(s). It may take 
65
    a number of optional or mandatory parameters.
66
    """
67
68
    def __init__(
            self,
            # FIXME: deprecate in favor of process_workspace(workspace)
            workspace : Optional[Workspace],
            ocrd_tool=None,
            parameter=None,
            input_file_grp=None,
            output_file_grp=None,
            page_id=None,
            download_files=True,
            # FIXME: deprecate all the following in favor of respective methods
            resolve_resource=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            subcommand=None,
            show_version=False,
            dump_json=False,
            dump_module_dir=False,
            version=None
    ):
        """
        Instantiate, but do not process.

        If one of ``dump_json``, ``dump_module_dir``, ``list_resources``,
        ``resolve_resource``, ``show_resource``, ``show_help`` or
        ``show_version`` is set, the respective information is printed and
        the constructor returns early (checked in that order), i.e. before
        the attributes needed for processing have been assigned. Otherwise,
        set up for processing (validating parameters, entering the workspace
        directory).

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
                 Can be ``None`` even for processing (esp. on multiple workspaces), \
                 but then needs to be set before running.
        Keyword Args:
             ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
                 Can be ``None`` for processing, but needs to be set before running.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
                 (or empty for all pages).
             download_files (boolean): Whether input files will be downloaded prior to processing.
             resolve_resource (string): If not ``None``, then instead of processing, resolve \
                 given resource by name and print its full path to stdout.
             show_resource (string): If not ``None``, then instead of processing, resolve \
                 given resource by name and print its contents to stdout.
             list_resources (boolean): If true, then instead of processing, find all installed \
                 resource files in the search paths and print their path names.
             show_help (boolean): If true, then instead of processing, print a usage description \
                 including the standard CLI and all of this processor's ocrd-tool parameters and \
                 docstrings.
             subcommand (string): 'worker' or 'server', only used here for the right --help output
             show_version (boolean): If true, then instead of processing, print information on \
                 this processor's version and OCR-D version.
             dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
                 on stdout.
             dump_module_dir (boolean): If true, then instead of processing, print :py:attr:`moduledir` \
                 on stdout.
             version (string): This processor's version, as printed by ``show_version`` \
                 and recorded by :py:meth:`add_metadata`.

        Raises:
             Exception: if ``parameter`` does not validate against ``ocrd_tool``.
             SystemExit: (status 1) if ``resolve_resource`` or ``show_resource`` \
                 names a resource that cannot be found.
        """
        # needed by all of the informational modes below
        self.ocrd_tool = ocrd_tool
        if parameter is None:
            parameter = {}
        # informational modes: print, then return without setting up processing
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if dump_module_dir:
            print(self.moduledir)
            return
        if list_resources:
            for resource in self.list_all_resources():
                print(resource)
            return
        if resolve_resource:
            try:
                print(self.resolve_resource(resolve_resource))
            except ResourceNotFoundError as err:
                getLogger('ocrd.processor.base').critical(err.message)
                sys.exit(1)
            return
        if show_resource:
            try:
                self.show_resource(show_resource)
            except ResourceNotFoundError as err:
                getLogger('ocrd.processor.base').critical(err.message)
                sys.exit(1)
            return
        if show_help:
            self.show_help(subcommand=subcommand)
            return
        # must be assigned before show_version() reads it
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        if self.workspace:
            # FIXME deprecate setting this and calling process() over using process_workspace()
            # which uses pushd_popd(self.workspace.directory)
            # (because there is no way to do that in process() since it's an
            # overridden method. chdir is almost always an anti-pattern.)
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        # normalize "no selection" (None or empty list) to None
        if page_id is None or page_id == []:
            self.page_id = None
        else:
            self.page_id = page_id
        self.download = download_files
        validation_report = ParameterValidator(ocrd_tool).validate(parameter)
        if not validation_report.is_valid:
            raise Exception("Invalid parameters %s" % validation_report.errors)
        self.parameter = parameter
        # workaround for deprecated#72 (deprecation does not work for subclasses):
        # wrap whatever process() this instance resolves to
        mark_deprecated = deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')
        self.process = mark_deprecated(self.process)
183
184
    def show_help(self, subcommand=None):
        """
        Print the usage description generated for this processor to stdout.

        Args:
            subcommand (string): passed through to ``generate_processor_help``
        """
        help_text = generate_processor_help(
            self.ocrd_tool,
            processor_instance=self,
            subcommand=subcommand)
        print(help_text)
186
187
    def show_version(self):
        """
        Print this processor's :py:attr:`version` and the OCR-D core version to stdout.
        """
        version_line = "Version %s, ocrd/core %s" % (self.version, OCRD_VERSION)
        print(version_line)
189
190
    def verify(self):
191
        """
192
        Verify that the :py:attr:`input_file_grp` fulfills the processor's requirements.
193
        """
194
        return True
195
196
    def setup(self) -> None:
197
        """
198
        Prepare the processor for actual data processing,
199
        prior to changing to the workspace directory but
200
        after parsing parameters.
201
202
        (Override this to load models into memory etc.)
203
        """
204
        pass
205
206
    @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')
    def process(self) -> None:
        """
        Legacy entry point: process all files of the :py:attr:`workspace`.

        Reads from the given :py:attr:`input_file_grp`, writes
        to the given :py:attr:`output_file_grp`,
        for the given :py:attr:`page_id` (or all pages),
        under the given :py:attr:`parameter`.

        The base implementation only raises :py:class:`NotImplementedError`;
        subclasses still using this interface have to override it.
        """
        raise NotImplementedError
218
219
    def process_workspace(self, workspace: Workspace) -> None:
        """
        Process all files of the given ``workspace``,
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id` (or all pages)
        under the given :py:attr:`parameter`.

        Changes into the workspace directory for the duration of the call,
        iterates over the page-wise input file tuples, optionally downloads
        each file, and hands every tuple to :py:meth:`process_page_file`.
        If that raises :py:class:`NotImplementedError`, the deprecated
        :py:meth:`process` is called instead.
        """
        # assert self.input_file_grp is not None
        # assert self.output_file_grp is not None
        # input_file_grps = self.input_file_grp.split(',')
        # for input_file_grp in input_file_grps:
        #     assert input_file_grp in workspace.mets.file_groups
        logger = getLogger('ocrd.processor.base')
        with pushd_popd(workspace.directory):
            self.workspace = workspace
            try:
                # FIXME: add page parallelization by running multiprocessing.Pool (#322)
                for page_files in self.zip_input_files(on_error='abort'):
                    # FIXME: add error handling by catching exceptions in various ways (#579)
                    # for example:
                    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
                    # - transient (I/O or OOM) error → maybe sleep, retry
                    # - persistent (data) error → skip / dummy / raise
                    prepared = []
                    for position, ocrd_file in enumerate(page_files):
                        if position == 0:
                            logger.info("processing page %s", ocrd_file.pageId)
                        elif ocrd_file is None:
                            # file/page not found in this file grp
                            prepared.append(None)
                            continue
                        entry = ocrd_file
                        if self.download:
                            try:
                                entry = self.workspace.download_file(ocrd_file)
                            except ValueError as err:
                                # the undownloaded file object stays in place
                                logger.error(repr(err))
                                logger.warning("skipping file %s for page %s", ocrd_file, ocrd_file.pageId)
                        prepared.append(entry)
                    self.process_page_file(*prepared)
            except NotImplementedError:
                # fall back to deprecated method
                self.process()
265
266
    def process_page_file(self, *input_files) -> None:
        """
        Process the given ``input_files`` of the :py:attr:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`~ocrd_models.OcrdFile` per input fileGrp)
        under the given :py:attr:`parameter`, and make sure the
        results get added accordingly.

        Entries may be ``None`` where the page has no file in the respective
        input fileGrp (this is how :py:meth:`process_workspace` passes gaps);
        the corresponding PAGE argument handed to :py:meth:`process_page_pcgts`
        is then ``None`` as well. The first entry is used to derive the page ID
        and the output file ID, so it must not be ``None``.

        If :py:meth:`process_page_pcgts` returns a list or tuple, its first
        item is taken as the PAGE result and every further item as an
        ``(image, file ID, file path)`` triple that gets saved as a derived
        image in the output fileGrp.

        (This uses process_page_pcgts, but can be overridden by subclasses
        to handle cases like multiple fileGrps, non-PAGE input etc.)
        """
        log = getLogger('ocrd.processor.base')
        input_pcgts = [None] * len(input_files)
        page_id = input_files[0].pageId
        for i, input_file in enumerate(input_files):
            if input_file is None:
                # nothing to parse for this fileGrp; keep input_pcgts[i] as None
                log.debug("ignoring missing file for input fileGrp #%d for page %s", i, page_id)
                continue
            # FIXME: what about non-PAGE input like image or JSON ???
            log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId)
            try:
                input_pcgts[i] = page_from_file(input_file)
            except ValueError as e:
                # not parseable as PAGE: leave the slot empty and carry on
                log.info("non-PAGE input for page %s: %s", page_id, e)
        output_file_id = make_file_id(input_files[0], self.output_file_grp)
        output_pcgts = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id)
        if isinstance(output_pcgts, (list, tuple)):
            # split the PAGE result from the derived images accompanying it
            output_images = output_pcgts[1:]
            output_pcgts = output_pcgts[0]
            for output_image_pil, output_image_id, output_image_path in output_images:
                self.workspace.save_image_file(
                    output_image_pil,
                    output_image_id,
                    self.output_file_grp,
                    page_id=page_id,
                    file_path=output_image_path)
        output_pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(output_pcgts)
        # FIXME: what about non-PAGE output like JSON ???
        self.workspace.add_file(file_id=output_file_id,
                                file_grp=self.output_file_grp,
                                page_id=page_id,
                                local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
                                mimetype=MIMETYPE_PAGE,
                                content=to_xml(output_pcgts))
308
309
    def process_page_pcgts(self, *input_pcgts, output_file_id : str = None, page_id : str = None) -> OcrdPage:
        """
        Process the given ``input_pcgts`` of the :py:attr:`workspace`,
        representing one physical page (passed as one parsed
        :py:class:`~ocrd_models.OcrdPage` per input fileGrp)
        under the given :py:attr:`parameter`, and return the
        resulting :py:class:`~ocrd_models.OcrdPage`.

        Alternatively, return a list or tuple consisting of the resulting
        :py:class:`~ocrd_models.OcrdPage` followed by one or more lists or
        tuples of :py:class:`PIL.Image` (image data), :py:class:`str` (file ID)
        and :py:class:`str` (file path), describing derived images
        to be annotated along with the resulting PAGE file.

        The base implementation only raises :py:class:`NotImplementedError`.

        (This contains the main functionality and must be overridden by subclasses.)
        """
        raise NotImplementedError
325
326
    def add_metadata(self, pcgts: OcrdPage) -> None:
        """
        Append a ``MetadataItem`` of type ``processingStep`` to the metadata of ``pcgts``.

        The item is named after the first entry of the ocrd-tool ``steps``,
        valued with the ocrd-tool ``executable``, and carries two label groups:
        one with every runtime parameter (name and value), and one with the
        versions of this processor and of ocrd/core.
        """
        metadata = pcgts.get_Metadata()
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=value)
                   for key, value in self.parameter.items()])
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=executable, value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        metadata.add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             name=step_name,
                             value=executable,
                             Labels=[parameter_labels, version_labels]))
349
350
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        If ``val`` itself exists as a path, it is returned unchanged.
        Otherwise the first existing resource candidate is returned; the
        candidate search starts from the directory that was current before
        the constructor changed into the workspace (if it did), or else
        from the present working directory.

        Args:
            val (string): resource value to resolve

        Raises:
            ResourceNotFoundError: if no existing candidate is found
        """
        initLogging()
        executable = self.ocrd_tool['executable']
        logger = getLogger('ocrd.processor.base')
        if exists(val):
            logger.debug("Resolved to absolute path %s" % val)
            return val
        search_cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=search_cwd, moduled=self.moduledir)
        existing = list(filter(exists, candidates))
        if not existing:
            raise ResourceNotFoundError(val, executable)
        resolved = existing[0]
        logger.debug("Resolved %s to absolute path %s" % (val, resolved))
        return resolved
375
376
    def show_resource(self, val):
        """
        Resolve the resource ``val`` and write its contents to stdout (as bytes).

        A regular file is written verbatim. A directory is packed into a
        gzip-compressed tar archive (built in memory, with paths relative
        to the directory itself) which is then written out.

        Args:
            val (string): resource value to resolve and print
        """
        resource_path = Path(self.resolve_resource(val))
        if not resource_path.is_dir():
            sys.stdout.buffer.write(resource_path.read_bytes())
            return
        with pushd_popd(resource_path):
            archive = io.BytesIO()
            with tarfile.open(fileobj=archive, mode='w:gz') as tarball:
                tarball.add('.')
            # rewind so the copy starts at the beginning of the archive
            archive.seek(0)
            copyfileobj(archive, sys.stdout.buffer)
388
389
    def list_all_resources(self):
        """
        Yield the paths of all resources found for this processor's executable,
        filtered by the content types the processor declares.

        Unless ``*/*`` is among the declared types, directories are only kept
        if ``text/directory`` is declared, and files are only kept if their
        suffix matches one of the declared types (a type without a known
        suffix matches any file).
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        for candidate in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            candidate = Path(candidate)
            if '*/*' in mimetypes:
                # anything goes
                yield candidate
                continue
            if candidate.is_dir() and 'text/directory' not in mimetypes:
                continue
            if candidate.is_file():
                suffix = candidate.suffix
                # if we do not know all MIME types, then keep the file, otherwise require suffix match
                if not any(suffix == MIME_TO_EXT.get(mime, suffix) for mime in mimetypes):
                    continue
            yield candidate
404
405
    @property
406
    def module(self):
407
        """
408
        The top-level module this processor belongs to.
409
        """
410
        # find shortest prefix path that is not just a namespace package
411
        fqname = ''
412
        for name in self.__module__.split('.'):
413
            if fqname:
414
                fqname += '.'
415
            fqname += name
416
            if getattr(sys.modules[fqname], '__file__', None):
417
                return fqname
418
        # fall-back
419
        return self.__module__
420
421
    @property
    def moduledir(self):
        """
        The filesystem path of the directory of :py:attr:`module`.
        """
        top_level_module = self.module
        return resource_filename(top_level_module, '.')
427
428
    @property
429
    def input_files(self):
430
        """
431
        List the input files (for single-valued :py:attr:`input_file_grp`).
432
433
        For each physical page:
434
435
        - If there is a single PAGE-XML for the page, take it (and forget about all
436
          other files for that page)
437
        - Else if there is a single image file, take it (and forget about all other
438
          files for that page)
439
        - Otherwise raise an error (complaining that only PAGE-XML warrants
440
          having multiple images for a single page)
441
        Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>_
442
        
443
        Returns:
444
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
445
        """
446
        if not self.input_file_grp:
447
            raise ValueError("Processor is missing input fileGrp")
448
        ret = self.zip_input_files(mimetype=None, on_error='abort')
449
        if not ret:
450
            return []
451
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
452
        return [tuples[0] for tuples in ret]
453
454
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
        """
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).

        Processors that expect/need multiple input file groups,
        cannot use :py:data:`input_files`. They must align (zip) input files
        across pages. This includes the case where not all pages
        are equally present in all file groups. It also requires
        making a consistent selection if there are multiple files
        per page.

        Following the OCR-D functional model, this function tries to
        find a single PAGE file per page, or fall back to a single
        image file per page. In either case, multiple matches per page
        are an error (see error handling below).
        This default behaviour can be changed by using a fixed MIME
        type filter via ``mimetype``. But still, multiple matching
        files per page are an error.

        Single-page multiple-file errors are handled according to
        ``on_error``:

        - if ``skip``, then the page for the respective fileGrp will be
          silently skipped (as if there was no match at all), no matter
          how many further matches follow
        - if ``first``, then the first matching file for the page will be
          silently selected (as if the first was the only match)
        - if ``last``, then the last matching file for the page will be
          silently selected (as if the last was the only match)
        - if ``abort``, then an exception will be raised.

        Without a MIME type filter, multiple matches for PAGE-XML will
        always raise an exception, and further files next to a PAGE-XML
        match are ignored.

        Keyword Args:
             require_first (boolean): If true, then skip a page entirely
                 whenever it is not available in the first input `fileGrp`.
             mimetype (string): If not `None`, filter by the specified MIME
                 type (literal or regex prefixed by `//`). Otherwise prefer
                 PAGE or image.
             on_error (string): How to handle multiple matches for one page
                 in one `fileGrp`: ``skip``, ``first``, ``last`` or ``abort``
                 (see above). With ``abort``, finding no files at all for a
                 requested :py:attr:`page_id` raises as well (otherwise this
                 is only logged as a warning).
        Returns:
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
        Raises:
            ValueError: if :py:attr:`input_file_grp` is not set, on multiple
                PAGE-XML matches, or for any error under ``on_error='abort'``.
            Exception: if a conflict occurs and ``on_error`` is not one of
                the known strategies.
        """
        if not self.input_file_grp:
            raise ValueError("Processor is missing input fileGrp")

        LOG = getLogger('ocrd.processor.base')
        ifgs = self.input_file_grp.split(",")

        def choose_on_conflict(kept, other, message):
            # Apply the on_error strategy to two competing files of one page/fileGrp:
            # return the file to keep (None for skipping), or raise.
            if on_error == 'skip':
                return None
            if on_error == 'first':
                return kept
            if on_error == 'last':
                return other
            if on_error == 'abort':
                raise ValueError(message)
            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)

        # Iterating over all files repeatedly may seem inefficient at first sight,
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
        # can actually be much more costly than traversing the ltree.
        # This might depend on the number of pages vs number of fileGrps.

        pages = dict()
        # (page ID, fileGrp index) slots emptied by the 'skip' strategy; remembered
        # so that a further match cannot silently re-fill the emptied slot
        skipped = set()
        for i, ifg in enumerate(ifgs):
            files_ = sorted(self.workspace.mets.find_all_files(
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
                                # sort by MIME type so PAGE comes before images
                                key=lambda file_: file_.mimetype)
            # Warn if no files found but pageId was specified because that
            # might be because of invalid page_id (range)
            if self.page_id and not files_:
                msg = (f"Could not find any files for --page-id {self.page_id} - "
                       f"compare '{self.page_id}' with the output of 'ocrd workspace list-page'.")
                if on_error == 'abort':
                    raise ValueError(msg)
                LOG.warning(msg)
            for file_ in files_:
                if not file_.pageId:
                    continue
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                if (file_.pageId, i) in skipped:
                    LOG.debug("ignoring file %s for skipped page %s in input file group %s", file_.ID, file_.pageId, ifg)
                    continue
                if not ift[i]:
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
                    ift[i] = file_
                    continue
                # fileGrp has multiple files for this page ID
                LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
                if mimetype:
                    # filter was active, this must not happen
                    conflict = "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
                        mimetype, file_.pageId, ifg)
                elif ift[i].mimetype == MIMETYPE_PAGE:
                    if file_.mimetype == MIMETYPE_PAGE:
                        raise ValueError(
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
                                file_.pageId, ifg))
                    # keep PAGE match
                    continue
                else:
                    # filter was inactive but no PAGE is in control, this must not happen
                    conflict = "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
                        file_.pageId, ifg)
                ift[i] = choose_on_conflict(ift[i], file_, conflict)
                if ift[i] is None:
                    skipped.add((file_.pageId, i))
        ifts = list()
        for page, ifiles in pages.items():
            for i, ifg in enumerate(ifgs):
                if not ifiles[i]:
                    # other fallback options?
                    LOG.error('found no page %s in file group %s',
                              page, ifg)
            if ifiles[0] or not require_first:
                ifts.append(tuple(ifiles))
        return ifts
574