|
1
|
|
|
import os |
|
2
|
|
|
import json |
|
3
|
|
|
from click import wrap_text |
|
4
|
|
|
import subprocess |
|
5
|
|
|
from ocrd_utils import getLogger, VERSION as OCRD_VERSION |
|
6
|
|
|
from ocrd_validators import ParameterValidator |
|
7
|
|
|
|
|
8
|
|
|
# Module-level logger used by run_processor() and run_cli() below.
log = getLogger('ocrd.processor')
|
9
|
|
|
|
|
10
|
|
|
def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None): |
|
11
|
|
|
if workspace is None: |
|
12
|
|
|
if resolver is None: |
|
13
|
|
|
raise Exception("Need to pass a resolver to create a workspace") |
|
14
|
|
|
if mets_url is None: |
|
15
|
|
|
raise Exception("Need to pass mets_url to create a workspace") |
|
16
|
|
|
workspace = resolver.workspace_from_url(mets_url, dst_dir=working_dir) |
|
17
|
|
|
return workspace |
|
18
|
|
|
|
|
19
|
|
|
def run_processor(
        processorClass,
        ocrd_tool=None,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None, # TODO actually use this!
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
): # pylint: disable=too-many-locals
    """
    Instantiate ``processorClass`` on a workspace, run it and record it in the METS.

    The workspace is either the one passed in or one created from
    ``resolver`` / ``mets_url`` / ``working_dir``. After ``process()`` has
    run, an agent entry naming the executable and its version is added to
    the workspace's METS and the METS is saved.

    Args:
        processorClass: Class to instantiate; called with the workspace as
            first positional argument plus the keyword arguments below.
        ocrd_tool: Forwarded to ``processorClass``.
        mets_url: METS location, used only when no ``workspace`` is passed.
        resolver: Resolver, used only when no ``workspace`` is passed.
        workspace: Existing workspace to operate on.
        page_id: Forwarded to ``processorClass``.
        log_level: Accepted but currently unused (see TODO in the signature).
        input_file_grp: Forwarded to ``processorClass``.
        output_file_grp: Forwarded to ``processorClass``.
        parameter: Forwarded to ``processorClass``.
        working_dir: Destination directory when a workspace has to be created.

    Returns:
        The processor instance after it has been run.
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    log.debug("Running processor %s", processorClass)

    init_kwargs = dict(
        ocrd_tool=ocrd_tool,
        page_id=page_id,
        input_file_grp=input_file_grp,
        output_file_grp=output_file_grp,
        parameter=parameter,
    )
    processor = processorClass(workspace, **init_kwargs)

    # Read the tool description back from the instance rather than trusting
    # the argument, exactly as the agent entry below requires.
    tool = processor.ocrd_tool
    agent_name = '%s v%s' % (tool['executable'], processor.version)
    first_step = tool['steps'][0]
    log.debug("Processor instance %s (%s doing %s)", processor, agent_name, first_step)

    processor.process()

    workspace.mets.add_agent(
        name=agent_name,
        _type='OTHER',
        othertype='SOFTWARE',
        role='OTHER',
        otherrole=first_step,
    )
    workspace.save_mets()
    return processor
|
67
|
|
|
|
|
68
|
|
|
def run_cli(
        executable,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
):
    """
    Create a workspace for mets_url and run MP CLI through it

    The command line is built from ``executable``, the workspace directory
    (``--working-dir``) and one option per argument that is set.

    Args:
        executable: Program to run.
        mets_url: METS location; passed as ``--mets`` when not None and used
            to create the workspace when none is passed.
        resolver: Resolver, used only when no ``workspace`` is passed.
        workspace: Existing workspace to operate on.
        page_id: Value for ``--page-id`` (omitted when falsy).
        log_level: Value for ``--log-level`` (omitted when falsy).
        input_file_grp: Value for ``--input-file-grp`` (omitted when falsy).
        output_file_grp: Value for ``--output-file-grp`` (omitted when falsy).
        parameter: Value for ``--parameter`` (omitted when falsy).
        working_dir: Destination directory when a workspace has to be created.

    Returns:
        The return value of ``subprocess.call`` for the assembled command.
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    args = [executable, '--working-dir', workspace.directory]

    # A caller may pass an existing workspace without a mets_url; putting
    # None into the argument list would make both the join below and
    # subprocess.call raise TypeError, so the option is left out instead.
    # NOTE(review): this presumes the executable can locate the METS from
    # --working-dir alone -- confirm against the CLI's defaults.
    if mets_url is not None:
        args += ['--mets', mets_url]

    optional_flags = (
        ('--log-level', log_level),
        ('--page-id', page_id),
        ('--input-file-grp', input_file_grp),
        ('--output-file-grp', output_file_grp),
        ('--parameter', parameter),
    )
    for flag, value in optional_flags:
        if value:
            args += [flag, value]

    log.debug("Running subprocess '%s'", ' '.join(args))
    return subprocess.call(args)
|
98
|
|
|
|
|
99
|
|
|
def generate_processor_help(ocrd_tool):
    """
    Render the ``--help`` text for a processor from its tool description.

    Args:
        ocrd_tool (dict): Tool description. ``executable`` and
            ``description`` are required. ``parameters``, ``input_file_grp``
            and ``output_file_grp`` are optional; a missing or empty
            ``parameters`` and missing file groups are rendered as ``NONE``.
            Each parameter entry must provide ``type`` and ``description``
            and may provide ``required``, ``default`` and ``enum``.

    Returns:
        str: The help text.
    """
    # NOTE(review): the literal spacing inside the strings below is kept as
    # found; confirm the column alignment against the rendered help output.
    parameters = ocrd_tool.get('parameters')
    if not parameters:
        parameter_help = ' NONE\n'
    else:
        entries = []
        for param_name, param in parameters.items():
            # "required" wins over "default"; neither leaves the slot empty.
            if param.get('required'):
                qualifier = ' - REQUIRED'
            elif 'default' in param:
                qualifier = ' - %s' % param['default']
            else:
                qualifier = ''
            if 'enum' in param:
                enum_hint = ' Possible values: %s' % json.dumps(param['enum'])
            else:
                enum_hint = ''
            entry = ' "%s" [%s%s] %s%s' % (
                param_name,
                param['type'],
                qualifier,
                param['description'],
                enum_hint,
            )
            entries.append(wrap_text(
                entry,
                subsequent_indent=' ',
                width=72,
                preserve_paragraphs=True,
            ))
        parameter_help = ''.join(wrapped + "\n" for wrapped in entries)

    template = '''
Usage: %s [OPTIONS]

%s

Options:
-V, --version Show version
-l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
Log level
-J, --dump-json Dump tool description as JSON and exit
-p, --parameter TEXT Parameters, either JSON string or path
JSON file
-g, --page-id TEXT ID(s) of the pages to process
-O, --output-file-grp TEXT File group(s) used as output.
-I, --input-file-grp TEXT File group(s) used as input.
-w, --working-dir TEXT Working Directory
-m, --mets TEXT METS to process
-h, --help This help message

Parameters:
%s
Default Wiring:
%s -> %s

'''
    return template % (
        ocrd_tool['executable'],
        ocrd_tool['description'],
        parameter_help,
        # Treated like output_file_grp: a tool description without a default
        # input group still gets a help text instead of a KeyError.
        ocrd_tool.get('input_file_grp', 'NONE'),
        ocrd_tool.get('output_file_grp', 'NONE'),
    )
|
145
|
|
|
|
|
146
|
|
|
|
|
147
|
|
|
class Processor():
    """
    A processor runs an algorithm based on the workspace, the mets.xml in the
    workspace (and the input files defined therein) as well as optional
    parameter.

    Subclasses are expected to override :meth:`process`.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        """
        Set up the processor, or print information and return early.

        With ``dump_json``, ``show_help`` or ``show_version`` set, the
        requested information is printed and initialisation stops there:
        attributes assigned further down (``workspace``, ``input_file_grp``,
        ``output_file_grp``, ``page_id``, ``parameter``, ...) are then NOT
        set on the instance.

        Args:
            workspace: Workspace to operate on; when truthy, the process's
                current directory is changed to ``workspace.directory``.
            ocrd_tool: Tool description, used for help/JSON output and
                handed to ``ParameterValidator``.
            parameter: Parameters to validate and store; None means ``{}``.
            input_file_grp: File group to read from.
            output_file_grp: File group to write to.
            page_id: Page selection; None or ``[]`` are stored as None.
            show_help: Print the help text and return.
            show_version: Print the version and return.
            dump_json: Print ``ocrd_tool`` as JSON and return.
            version: Version reported by :meth:`show_version`.

        Raises:
            Exception: If ``parameter`` does not validate against ``ocrd_tool``.
        """
        if parameter is None:
            parameter = {}
        # The three informational modes are checked in this fixed order and
        # each one returns before the following attributes are assigned.
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        # An empty list means "no page selection", same as None.
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        # Stored only after validation succeeded.
        self.parameter = parameter

    def show_help(self):
        """
        Print the help text generated from ``self.ocrd_tool``.
        """
        print(generate_processor_help(self.ocrd_tool))

    def show_version(self):
        """
        Print this processor's version together with the ocrd/core version.
        """
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))

    def verify(self):
        """
        Verify that the input fulfills the processor's requirements.

        This base implementation performs no checks and always returns True.
        """
        return True

    def process(self):
        """
        Process the workspace

        Raises:
            NotImplementedError: Always, in this base class; subclasses
                must provide the implementation.
        """
        # NotImplementedError is the conventional signal for a method that
        # subclasses must override; it is still caught by ``except Exception``.
        raise NotImplementedError("Must be implemented")

    @property
    def input_files(self):
        """
        List the input files

        Returns whatever ``workspace.mets.find_files`` yields for
        ``self.input_file_grp`` and ``self.page_id``.
        """
        return self.workspace.mets.find_files(fileGrp=self.input_file_grp, pageId=self.page_id)
|
219
|
|
|
|