Passed
Pull Request — master (#800)
by Konstantin
02:22
created

ocrd.processor.base.Processor.moduledir()   A

Complexity

Conditions 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""
2
Processor base class and helper functions.
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from pkg_resources import resource_filename
13
from os.path import exists
14
from shutil import copyfileobj
15
import json
16
import os
17
from os import getcwd
18
from pathlib import Path
19
import sys
20
import tarfile
21
import io
22
23
from ocrd_utils import (
24
    VERSION as OCRD_VERSION,
25
    MIMETYPE_PAGE,
26
    MIME_TO_EXT,
27
    getLogger,
28
    initLogging,
29
    list_resource_candidates,
30
    pushd_popd,
31
    list_all_resources,
32
    get_processor_resource_types
33
)
34
from ocrd_validators import ParameterValidator
35
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
36
37
# XXX imports must remain for backwards-compatibility
38
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
39
40
class Processor():
41
    """
42
    A processor is a tool that implements the uniform OCR-D command-line interface
43
    for run-time data processing. That is, it executes a single workflow step,
44
    or a combination of workflow steps, on the workspace (represented by local METS).
45
    It reads input files for all or requested physical pages of the input fileGrp(s),
46
    and writes output files for them into the output fileGrp(s). It may take 
47
    a number of optional or mandatory parameters.
48
    """
49
50
    def __init__(
51
            self,
52
            workspace,
53
            ocrd_tool=None,
54
            parameter=None,
55
            # TODO OCR-D/core#274
56
            # input_file_grp=None,
57
            # output_file_grp=None,
58
            input_file_grp="INPUT",
59
            output_file_grp="OUTPUT",
60
            page_id=None,
61
            show_resource=None,
62
            list_resources=False,
63
            show_help=False,
64
            show_version=False,
65
            dump_json=False,
66
            version=None
67
    ):
68
        """
69
        Instantiate, but do not process. Unless ``list_resources`` or
70
        ``show_resource`` or ``show_help`` or ``show_version`` or
71
        ``dump_json`` is true, setup for processing (parsing and
72
        validating parameters, entering the workspace directory).
73
74
        Args:
75
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
76
                 Can be ``None`` even for processing (esp. on multiple workspaces), \
77
                 but then needs to be set before running.
78
        Keyword Args:
79
             ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
80
                 Can be ``None`` for processing, but needs to be set before running.
81
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
82
                 Can be ``None`` even for processing, but then needs to be set before running.
83
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
84
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
85
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
86
                 (or empty for all pages).
87
             show_resource (string): If not ``None``, then instead of processing, resolve \
88
                 given resource by name and print its contents to stdout.
89
             list_resources (boolean): If true, then instead of processing, find all installed \
90
                 resource files in the search paths and print their path names.
91
             show_help (boolean): If true, then instead of processing, print a usage description \
92
                 including the standard CLI and all of this processor's ocrd-tool parameters and \
93
                 docstrings.
94
             show_version (boolean): If true, then instead of processing, print information on \
95
                 this processor's version and OCR-D version. Exit afterwards.
96
             dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
97
                 on stdout.
98
        """
99
        self.ocrd_tool = ocrd_tool
100
        if parameter is None:
101
            parameter = {}
102
        if dump_json:
103
            print(json.dumps(ocrd_tool, indent=True))
104
            return
105
        if list_resources:
106
            for res in self.list_all_resources():
107
                print(res)
108
            return
109
        if show_resource:
110
            initLogging()
111
            res_fname = self.resolve_resource(show_resource)
112
            fpath = Path(res_fname)
113
            if fpath.is_dir():
114
                with pushd_popd(fpath):
115
                    fileobj = io.BytesIO()
116
                    with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball:
117
                        tarball.add('.')
118
                    fileobj.seek(0)
119
                    copyfileobj(fileobj, sys.stdout.buffer)
120
            else:
121
                sys.stdout.buffer.write(fpath.read_bytes())
122
            return
123
        if show_help:
124
            self.show_help()
125
            return
126
        self.version = version
127
        if show_version:
128
            self.show_version()
129
            return
130
        self.workspace = workspace
131
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
132
        # but there is no way to do that in process here since it's an
133
        # overridden method. chdir is almost always an anti-pattern.
134
        if self.workspace:
135
            self.old_pwd = getcwd()
136
            os.chdir(self.workspace.directory)
137
        self.input_file_grp = input_file_grp
138
        self.output_file_grp = output_file_grp
139
        self.page_id = None if page_id == [] or page_id is None else page_id
140
        parameterValidator = ParameterValidator(ocrd_tool)
141
        report = parameterValidator.validate(parameter)
142
        if not report.is_valid:
143
            raise Exception("Invalid parameters %s" % report.errors)
144
        self.parameter = parameter
145
146
    def show_help(self):
        """
        Print the usage description generated from :py:attr:`ocrd_tool` for this instance.
        """
        help_text = generate_processor_help(self.ocrd_tool, processor_instance=self)
        print(help_text)
148
149
    def show_version(self):
        """
        Print this processor's :py:attr:`version` together with the OCR-D core version.
        """
        versions = (self.version, OCRD_VERSION)
        print("Version %s, ocrd/core %s" % versions)
151
152
    def verify(self):
        """
        Check whether the :py:attr:`input_file_grp` meets the processor's requirements.

        This base implementation performs no actual check.

        Returns:
            ``True``, unconditionally.
        """
        requirements_met = True
        return requirements_met
157
158
    def process(self):
        """
        Process the :py:attr:`workspace`
        from the given :py:attr:`input_file_grp`
        to the given :py:attr:`output_file_grp`
        for the given :py:attr:`page_id`
        under the given :py:attr:`parameter`.

        (This contains the main functionality and needs to be overridden by subclasses.)

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        # NotImplementedError is the conventional marker for an abstract method;
        # it derives from Exception, so existing ``except Exception`` handlers still match.
        raise NotImplementedError("Must be implemented")
169
170
171
    def add_metadata(self, pcgts):
        """
        Append a PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem``
        to the metadata of :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.

        The item has type ``processingStep``, is named after the first entry of the
        ocrd-tool ``steps``, has the ocrd-tool ``executable`` as value, and carries two
        groups of labels: one label per runtime parameter, and the versions of this
        processor and of ocrd/core.
        """
        add_item = pcgts.get_Metadata().add_MetadataItem
        step_name = self.ocrd_tool['steps'][0]
        executable = self.ocrd_tool['executable']
        # one label for each runtime parameter
        parameter_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="parameters",
            Label=[LabelType(type_=key, value=setting)
                   for key, setting in self.parameter.items()])
        # processor version and core version
        version_labels = LabelsType(
            externalModel="ocrd-tool",
            externalId="version",
            Label=[LabelType(type_=executable, value=self.version),
                   LabelType(type_='ocrd/core', value=OCRD_VERSION)])
        add_item(MetadataItemType(
            type_="processingStep",
            name=step_name,
            value=executable,
            Labels=[parameter_labels, version_labels]))
194
195
    def resolve_resource(self, val):
        """
        Resolve a resource name to a file path with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        If ``val`` itself exists as a path, it is returned unchanged. Otherwise
        the first existing candidate produced by ``list_resource_candidates``
        is returned. The candidate search uses the directory that was current
        before the constructor changed into the workspace (if it did), else
        the current working directory.

        If nothing is found, an error is logged and the process is terminated
        via ``sys.exit(1)``.

        Args:
            val (string): resource value to resolve
        Returns:
            The resolved path (``val`` itself or the first existing candidate).
        """
        executable = self.ocrd_tool['executable']
        log = getLogger('ocrd.%s.resolve_resource' % executable)
        if exists(val):
            log.debug("Resolved to absolute path %s", val)
            return val
        # old_pwd is only set when __init__ changed into a workspace directory
        cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
        candidates = list_resource_candidates(executable, val,
                                              cwd=cwd, moduled=self.moduledir)
        # only the first hit is needed, so stop probing the filesystem after it
        found = next((cand for cand in candidates if exists(cand)), None)
        if found is not None:
            log.debug("Resolved %s to absolute path %s", val, found)
            return found
        log.error("Could not find resource '%s' for executable '%s'. "
                  "Try 'ocrd resmgr download %s %s' to download this resource.",
                  val, executable, executable, val)
        sys.exit(1)
222
223
    def list_all_resources(self):
        """
        Iterate over all resources found in the filesystem for this processor's
        executable, keeping those whose kind matches the processor's resource MIME types.

        Unless the wildcard ``*/*`` is among the MIME types, directories are only kept
        if ``text/directory`` is listed, and regular files are only kept if their
        suffix matches at least one listed MIME type.

        Yields:
            :py:class:`pathlib.Path` objects of the matching resources.
        """
        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
        for found in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
            res = Path(found)
            if '*/*' in mimetypes:
                # wildcard: everything is acceptable
                yield res
                continue
            if res.is_dir() and 'text/directory' not in mimetypes:
                continue
            if res.is_file():
                # a MIME type without a known extension falls back to the file's
                # own suffix, i.e. it matches any file
                suffix_matches = any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
                                     for mime in mimetypes)
                if not suffix_matches:
                    continue
            yield res
238
239
    @property
240
    def module(self):
241
        """
242
        The top-level module this processor belongs to.
243
        """
244
        return self.__module__.split('.')[0]
245
246
    @property
    def moduledir(self):
        """
        Filesystem path of the directory of :py:attr:`module`,
        as reported by ``pkg_resources.resource_filename``.
        """
        top_level = self.module
        return resource_filename(top_level, '')
252
253
    @property
254
    def input_files(self):
255
        """
256
        List the input files (for single-valued :py:attr:`input_file_grp`).
257
258
        For each physical page:
259
260
        - If there is a single PAGE-XML for the page, take it (and forget about all
261
          other files for that page)
262
        - Else if there is a single image file, take it (and forget about all other
263
          files for that page)
264
        - Otherwise raise an error (complaining that only PAGE-XML warrants
265
          having multiple images for a single page)
266
        Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>_
267
        
268
        Returns:
269
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects.
270
        """
271
        if not self.input_file_grp:
272
            raise ValueError("Processor is missing input fileGrp")
273
        ret = self.zip_input_files(mimetype=None, on_error='abort')
274
        if not ret:
275
            return []
276
        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
277
        return [tuples[0] for tuples in ret]
278
279
    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
280
        """
281
        List tuples of input files (for multi-valued :py:attr:`input_file_grp`).
282
283
        Processors that expect/need multiple input file groups,
284
        cannot use :py:data:`input_files`. They must align (zip) input files
285
        across pages. This includes the case where not all pages
286
        are equally present in all file groups. It also requires
287
        making a consistent selection if there are multiple files
288
        per page.
289
290
        Following the OCR-D functional model, this function tries to
291
        find a single PAGE file per page, or fall back to a single
292
        image file per page. In either case, multiple matches per page
293
        are an error (see error handling below).
294
        This default behaviour can be changed by using a fixed MIME
295
        type filter via :py:attr:`mimetype`. But still, multiple matching
296
        files per page are an error.
297
298
        Single-page multiple-file errors are handled according to
299
        :py:attr:`on_error`:
300
301
        - if ``skip``, then the page for the respective fileGrp will be
302
          silently skipped (as if there was no match at all)
303
        - if ``first``, then the first matching file for the page will be
304
          silently selected (as if the first was the only match)
305
        - if ``last``, then the last matching file for the page will be
306
          silently selected (as if the last was the only match)
307
        - if ``abort``, then an exception will be raised.
308
        Multiple matches for PAGE-XML will always raise an exception.
309
310
        Keyword Args:
311
             require_first (boolean): If true, then skip a page entirely
312
                 whenever it is not available in the first input `fileGrp`.
313
             mimetype (string): If not `None`, filter by the specified MIME
314
                 type (literal or regex prefixed by `//`). Otherwise prefer
315
                 PAGE or image.
316
        Returns:
317
            A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples.
318
        """
319
        if not self.input_file_grp:
320
            raise ValueError("Processor is missing input fileGrp")
321
322
        LOG = getLogger('ocrd.processor.base')
323
        ifgs = self.input_file_grp.split(",")
324
        # Iterating over all files repeatedly may seem inefficient at first sight,
325
        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
326
        # can actually be much more costly than traversing the ltree.
327
        # This might depend on the number of pages vs number of fileGrps.
328
329
        pages = dict()
330
        for i, ifg in enumerate(ifgs):
331
            for file_ in sorted(self.workspace.mets.find_all_files(
332
                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
333
                                # sort by MIME type so PAGE comes before images
334
                                key=lambda file_: file_.mimetype):
335
                if not file_.pageId:
336
                    continue
337
                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
338
                if ift[i]:
339
                    LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg)
340
                    # fileGrp has multiple files for this page ID
341
                    if mimetype:
342
                        # filter was active, this must not happen
343 View Code Duplication
                        if on_error == 'skip':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
344
                            ift[i] = None
345
                        elif on_error == 'first':
346
                            pass # keep first match
347
                        elif on_error == 'last':
348
                            ift[i] = file_
349
                        elif on_error == 'abort':
350
                            raise ValueError(
351
                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
352
                                    mimetype, file_.pageId, ifg))
353
                        else:
354
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
355
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
356
                          file_.mimetype != MIMETYPE_PAGE):
357
                        pass # keep PAGE match
358
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
359
                          file_.mimetype == MIMETYPE_PAGE):
360
                        raise ValueError(
361
                            "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
362
                                file_.pageId, ifg))
363
                    else:
364
                        # filter was inactive but no PAGE is in control, this must not happen
365 View Code Duplication
                        if on_error == 'skip':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
366
                            ift[i] = None
367
                        elif on_error == 'first':
368
                            pass # keep first match
369
                        elif on_error == 'last':
370
                            ift[i] = file_
371
                        elif on_error == 'abort':
372
                            raise ValueError(
373
                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
374
                                    file_.pageId, ifg))
375
                        else:
376
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
377
                else:
378
                    LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
379
                    ift[i] = file_
380
        ifts = list()
381
        for page, ifiles in pages.items():
382
            for i, ifg in enumerate(ifgs):
383
                if not ifiles[i]:
384
                    # other fallback options?
385
                    LOG.error('found no page %s in file group %s',
386
                              page, ifg)
387
            if ifiles[0] or not require_first:
388
                ifts.append(tuple(ifiles))
389
        return ifts
390