Passed
Push — master ( 44556e...06b451 )
by Konstantin
43s queued 11s
created

ocrd.processor.base   A

Complexity

Total Complexity 31

Size/Duplication

Total Lines 219
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 31
eloc 150
dl 0
loc 219
rs 9.92
c 0
b 0
f 0

4 Functions

Rating   Name   Duplication   Size   Complexity  
A _get_workspace() 0 8 4
B run_processor() 0 48 1
B run_cli() 0 30 6
B generate_processor_help() 0 45 7

6 Methods

Rating   Name   Duplication   Size   Complexity  
A Processor.show_version() 0 2 1
A Processor.process() 0 5 1
B Processor.__init__() 0 40 8
A Processor.verify() 0 5 1
A Processor.input_files() 0 6 1
A Processor.show_help() 0 2 1
1
import os
2
import json
3
from click import wrap_text
4
import subprocess
5
from ocrd_utils import getLogger, VERSION as OCRD_VERSION
6
from ocrd_validators import ParameterValidator
7
8
# Module-level logger used by run_processor() and run_cli() below.
log = getLogger('ocrd.processor')
9
10
def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None):
11
    if workspace is None:
12
        if resolver is None:
13
            raise Exception("Need to pass a resolver to create a workspace")
14
        if mets_url is None:
15
            raise Exception("Need to pass mets_url to create a workspace")
16
        workspace = resolver.workspace_from_url(mets_url, dst_dir=working_dir)
17
    return workspace
18
19
def run_processor(
        processorClass,
        ocrd_tool=None,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None,         # TODO actually use this!
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
): # pylint: disable=too-many-locals
    """
    Instantiate a processor on a workspace, run it and record it in the METS.

    The workspace is taken from ``workspace`` or, if that is None, created
    from ``mets_url`` via ``resolver`` (see ``_get_workspace``). After
    ``process()`` returns, an agent entry describing the processor is added
    to the workspace's METS and the METS is saved.

    Args:
        processorClass: Class to instantiate; called with the workspace as
            first argument and the keyword arguments listed below.
        ocrd_tool: Tool description forwarded to the processor constructor.
        mets_url: METS location, used only when ``workspace`` is None.
        resolver: Resolver used to create the workspace when none is given.
        workspace: Existing workspace to run on.
        page_id: Forwarded to the processor constructor.
        log_level: Currently ignored (see the TODO in the signature).
        input_file_grp: Forwarded to the processor constructor.
        output_file_grp: Forwarded to the processor constructor.
        parameter: Forwarded to the processor constructor.
            NOTE(review): earlier documentation called this a URL to the
            parameter file; confirm the expected type against processorClass.
        working_dir: Destination directory when the workspace is created.

    Returns:
        The processor instance after it has been run.

    Raises:
        Exception: If neither a workspace nor resolver plus mets_url is given.
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    log.debug("Running processor %s", processorClass)
    processor = processorClass(
        workspace,
        ocrd_tool=ocrd_tool,
        page_id=page_id,
        input_file_grp=input_file_grp,
        output_file_grp=output_file_grp,
        parameter=parameter
    )
    # Use the tool description as stored on the instance from here on.
    ocrd_tool = processor.ocrd_tool
    name = '%s v%s' % (ocrd_tool['executable'], processor.version)
    # The first entry of 'steps' is recorded as the agent's OTHERROLE.
    otherrole = ocrd_tool['steps'][0]
    log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
    processor.process()
    workspace.mets.add_agent(
        name=name,
        _type='OTHER',
        othertype='SOFTWARE',
        role='OTHER',
        otherrole=otherrole
    )
    workspace.save_mets()
    return processor
67
68
def run_cli(
        executable,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
):
    """
    Create a workspace for mets_url and run MP CLI through it

    Builds the command line for ``executable`` from the given options and
    runs it with ``subprocess.call``. Options whose value is falsy are left
    out of the command line.

    Args:
        executable: Program to run.
        mets_url: Passed as ``--mets``; also used to create the workspace
            when ``workspace`` is None.
        resolver: Resolver used to create the workspace when none is given.
        workspace: Existing workspace; its ``directory`` is passed as
            ``--working-dir``.
        page_id: Passed as ``--page-id``.
        log_level: Passed as ``--log-level``.
        input_file_grp: Passed as ``--input-file-grp``.
        output_file_grp: Passed as ``--output-file-grp``.
        parameter: Passed as ``--parameter``.
        working_dir: Destination directory when the workspace is created.

    Returns:
        The return code of the subprocess.

    Raises:
        Exception: If neither a workspace nor resolver plus mets_url is given.
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    args = [executable, '--working-dir', workspace.directory]
    # mets_url may legitimately be None when an existing workspace was passed;
    # putting None into argv would make both the join below and
    # subprocess.call raise TypeError.
    # NOTE(review): without --mets the executable presumably falls back to its
    # own default METS location inside the working directory -- confirm.
    if mets_url is not None:
        args += ['--mets', mets_url]
    optional = (
        ('--log-level', log_level),
        ('--page-id', page_id),
        ('--input-file-grp', input_file_grp),
        ('--output-file-grp', output_file_grp),
        ('--parameter', parameter),
    )
    for flag, value in optional:
        if value:
            args += [flag, value]
    log.debug("Running subprocess '%s'", ' '.join(args))
    return subprocess.call(args)
98
99
def generate_processor_help(ocrd_tool):
    """
    Render the command line help text for a processor.

    Args:
        ocrd_tool (dict): Tool description. ``executable`` and ``description``
            are required. ``parameters`` (mapping of parameter name to a dict
            with ``type`` and ``description`` and optionally ``required``,
            ``default`` and ``enum``), ``input_file_grp`` and
            ``output_file_grp`` are optional; missing file groups are shown
            as ``NONE``.

    Returns:
        str: The help text.

    Raises:
        KeyError: If ``executable`` or ``description`` is missing, or a
            parameter lacks ``type`` or ``description``.
    """
    parameters = ocrd_tool.get('parameters')
    if not parameters:
        parameter_help = '  NONE\n'
    else:
        entries = []
        for param_name, param in parameters.items():
            # A required parameter is flagged as such; otherwise its default
            # (if any) is shown.
            if param.get('required'):
                status = ' - REQUIRED'
            elif 'default' in param:
                status = ' - %s' % param['default']
            else:
                status = ''
            if 'enum' in param:
                choices = ' Possible values: %s' % json.dumps(param['enum'])
            else:
                choices = ''
            entry = '  "%s" [%s%s] %s%s' % (
                param_name,
                param['type'],
                status,
                param['description'],
                choices
            )
            entries.append(wrap_text(
                entry, subsequent_indent='    ', width=72, preserve_paragraphs=True))
        # Every entry is terminated by a newline.
        parameter_help = ''.join(entry + "\n" for entry in entries)
    return '''
Usage: %s [OPTIONS]

  %s

Options:
  -V, --version                   Show version
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Log level
  -J, --dump-json                 Dump tool description as JSON and exit
  -p, --parameter TEXT            Parameters, either JSON string or path to
                                  JSON file
  -g, --page-id TEXT              ID(s) of the pages to process
  -O, --output-file-grp TEXT      File group(s) used as output.
  -I, --input-file-grp TEXT       File group(s) used as input.
  -w, --working-dir TEXT          Working Directory
  -m, --mets TEXT                 METS to process
  -h, --help                      This help message

Parameters:
%s
Default Wiring:
  %s -> %s

''' % (
    ocrd_tool['executable'],
    ocrd_tool['description'],
    parameter_help,
    ocrd_tool.get('input_file_grp', 'NONE'),
    ocrd_tool.get('output_file_grp', 'NONE')
)
145
146
147
class Processor():
    """
    A processor runs an algorithm based on the workspace, the mets.xml in the
    workspace (and the input files defined therein) as well as optional
    parameter.

    Subclasses are expected to override ``process()``.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        """
        Set up the processor, or print information and return early.

        If ``dump_json``, ``show_help`` or ``show_version`` is set, the
        corresponding text is printed and initialization stops at that point,
        so the attributes assigned further down (e.g. ``workspace``,
        ``parameter``) are not set on the instance.

        Args:
            workspace: Workspace to operate on. If truthy, the current working
                directory is changed to its ``directory``.
            ocrd_tool: Tool description; also used to validate ``parameter``.
            parameter: Parameters for the processor; defaults to an empty dict.
            input_file_grp: Input file group.
            output_file_grp: Output file group.
            page_id: Page ID(s) to restrict processing to; ``None`` and ``[]``
                are both stored as ``None``.
            show_help: Print the help text and return.
            show_version: Print the version and return.
            dump_json: Print ``ocrd_tool`` as JSON and return.
            version: Version of the processor.

        Raises:
            Exception: If ``parameter`` does not validate against ``ocrd_tool``.
        """
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        # Normalize "no pages selected" (None or an empty list) to None.
        self.page_id = None if page_id == [] or page_id is None else page_id
        report = ParameterValidator(ocrd_tool).validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        """
        Print the help text generated from ``self.ocrd_tool``.
        """
        print(generate_processor_help(self.ocrd_tool))

    def show_version(self):
        """
        Print the processor version together with the ocrd/core version.
        """
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))

    def verify(self):
        """
        Verify that the input fulfills the processor's requirements.

        This base implementation always returns True.
        """
        return True

    def process(self):
        """
        Process the workspace

        Raises:
            NotImplementedError: Always, in this base class; subclasses must
                override this method.
        """
        # NotImplementedError is a subclass of Exception, so callers that
        # catch Exception keep working.
        raise NotImplementedError("Must be implemented")

    @property
    def input_files(self):
        """
        List the input files

        Returns the result of ``workspace.mets.find_files`` for the
        processor's input file group and page ID.
        """
        return self.workspace.mets.find_files(fileGrp=self.input_file_grp, pageId=self.page_id)
219