Passed
Push — master ( da9bd1...667072 )
by Konstantin
01:39
created

ocrd.processor.base.Processor.add_metadata()   A

Complexity

Conditions 1

Size

Total Lines 14
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 14
rs 9.85
c 0
b 0
f 0
cc 1
nop 2
1
"""
2
Processor base class and helper functions
3
"""
4
5
# Public API of this module. The helper functions are re-exported from
# ``.helpers`` for backwards compatibility, so every name listed here must
# match a name that is actually defined or imported in this module.
__all__ = ['Processor', 'generate_processor_help', 'run_cli', 'run_processor']
6
7
import os
8
import json
9
from ocrd_utils import getLogger, VERSION as OCRD_VERSION, MIMETYPE_PAGE
10
from ocrd_validators import ParameterValidator
11
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
12
13
# XXX imports must remain for backwards-compatibility
14
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
15
16
# Module-level logger, obtained from ocrd_utils under the name 'ocrd.processor'.
log = getLogger('ocrd.processor')
17
18
class Processor():
    """
    A processor runs an algorithm based on the workspace, the mets.xml in the
    workspace (and the input files defined therein) as well as optional
    parameter.

    The base class only provides the common set-up; :py:meth:`process` raises
    unless it is overridden.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        """
        Set up the processor, or serve one of the informational flags.

        The flags are checked in the order ``dump_json``, ``show_help``,
        ``show_version``. Each one prints its output and returns immediately,
        so in that case the instance is only partially initialised (for
        example ``workspace`` and ``parameter`` are never assigned).

        Args:
            workspace: Workspace to operate on. If truthy, the current working
                directory is changed to ``workspace.directory``.
            ocrd_tool: Tool description. Printed as JSON for ``dump_json``,
                handed to ``ParameterValidator`` and read by
                :py:meth:`add_metadata`.
            parameter: Parameter mapping; ``None`` is replaced by ``{}``.
            input_file_grp: Stored as ``self.input_file_grp``.
            output_file_grp: Stored as ``self.output_file_grp``.
            page_id: Stored as ``self.page_id``; an empty list is normalised
                to ``None``.
            show_help: Print the help text and return.
            show_version: Print the version line and return.
            dump_json: Print ``ocrd_tool`` as JSON and return.
            version: Stored as ``self.version``.

        Raises:
            Exception: If the validation report for ``parameter`` is not
                valid.
        """
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        # Treat "no pages selected" ([]) the same as "not specified" (None).
        self.page_id = None if page_id == [] or page_id is None else page_id
        report = ParameterValidator(ocrd_tool).validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        """
        Print the help text generated from ``self.ocrd_tool``.
        """
        print(generate_processor_help(self.ocrd_tool))

    def show_version(self):
        """
        Print the processor version together with the ocrd/core version.
        """
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))

    def verify(self):
        """
        Verify that the input fulfills the processor's requirements.

        The base implementation performs no checks and always returns ``True``.
        """
        return True

    def process(self):
        """
        Process the workspace

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        # NotImplementedError is still an Exception, so callers catching the
        # previously raised plain Exception keep working.
        raise NotImplementedError("Must be implemented")

    def add_metadata(self, pcgts):
        """
        Adds PAGE-XML MetadataItem describing the processing step

        The item is named after the first entry of ``ocrd_tool['steps']``,
        carries ``ocrd_tool['executable']`` as its value and gets one label
        per entry of ``self.parameter``.

        Args:
            pcgts: Object whose ``get_Metadata()`` result receives the new
                item via ``add_MetadataItem``.
        """
        parameter_labels = [
            LabelType(type_=name, value=value)
            for name, value in self.parameter.items()
        ]
        pcgts.get_Metadata().add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=self.ocrd_tool['executable'],
                Labels=[LabelsType(
                    externalModel="ocrd-tool",
                    externalId="parameters",
                    Label=parameter_labels)]))

    @property
    def input_files(self):
        """
        List the input files.

        - If there's a PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there's only one image, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)
        (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)

        Returns:
            The result of ``workspace.mets.find_files`` for PAGE-XML files in
            the input file group if that is non-empty, otherwise the result
            for image files.

        Raises:
            ValueError: If ``self.page_id`` is set, no PAGE-XML was found and
                more than one image matched.
        """
        ret = self.workspace.mets.find_files(
            fileGrp=self.input_file_grp, pageId=self.page_id, mimetype=MIMETYPE_PAGE)
        if ret:
            return ret
        # NOTE(review): the leading "//" presumably marks the value as a
        # regular expression for find_files -- confirm against its docs.
        ret = self.workspace.mets.find_files(
            fileGrp=self.input_file_grp, pageId=self.page_id, mimetype="//image/.*")
        if self.page_id and len(ret) > 1:
            # page_id is known to be truthy here, so the page is always named.
            raise ValueError("No PAGE-XML %s in fileGrp '%s' but multiple images." % (
                "for page '%s'" % self.page_id,
                self.input_file_grp
                ))
        return ret
128