|
1
|
|
|
""" |
|
2
|
|
|
Helper methods for running and documenting processors |
|
3
|
|
|
""" |
|
4
|
|
|
from time import time |
|
5
|
|
|
import json |
|
6
|
|
|
import subprocess |
|
7
|
|
|
|
|
8
|
|
|
from click import wrap_text |
|
9
|
|
|
from ocrd_utils import getLogger |
|
10
|
|
|
|
|
11
|
|
|
# Public names exported by a star-import of this module
__all__ = ['generate_processor_help', 'run_cli', 'run_processor']

# Module-level logger shared by the helper functions in this module
log = getLogger('ocrd.processor')
|
18
|
|
|
|
|
19
|
|
|
def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None):
    """
    Return ``workspace`` if one was passed, otherwise create one via ``resolver``.

    Args:
        workspace: An existing workspace; returned unchanged unless it is ``None``.
        resolver: Object providing ``workspace_from_url``; required when
            ``workspace`` is ``None``.
        mets_url: First argument for ``resolver.workspace_from_url``; required
            when ``workspace`` is ``None``.
        working_dir: Forwarded as ``dst_dir`` to ``resolver.workspace_from_url``.

    Returns:
        The passed-in workspace, or the result of ``resolver.workspace_from_url``.

    Raises:
        Exception: If ``workspace`` is ``None`` and ``resolver`` or ``mets_url``
            is ``None`` (the resolver is checked first).
    """
    # Nothing to create if the caller already has a workspace
    if workspace is not None:
        return workspace
    if resolver is None:
        raise Exception("Need to pass a resolver to create a workspace")
    if mets_url is None:
        raise Exception("Need to pass mets_url to create a workspace")
    return resolver.workspace_from_url(mets_url, dst_dir=working_dir)
|
27
|
|
|
|
|
28
|
|
|
def run_processor(
        processorClass,
        ocrd_tool=None,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None,  # TODO actually use this!
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        parameter_override=None,
        working_dir=None,
):  # pylint: disable=too-many-locals
    """
    Instantiate ``processorClass`` on a workspace, run it and record it in the METS.

    The workspace is taken from ``workspace`` or, if that is ``None``, created
    by ``_get_workspace`` from ``resolver``, ``mets_url`` and ``working_dir``.
    After ``processor.process()`` has returned, the wall-clock duration is
    logged to the ``ocrd.process.profile`` logger, an agent entry describing
    the processor is added to ``workspace.mets`` and the METS is saved.

    Args:
        processorClass: Callable returning the processor instance. It is called
            with the workspace as only positional argument plus the keyword
            arguments ``ocrd_tool``, ``page_id``, ``input_file_grp``,
            ``output_file_grp`` and ``parameter``.
        ocrd_tool: Forwarded to ``processorClass``. The description used
            afterwards is always read back from ``processor.ocrd_tool``.
        mets_url: Used to create the workspace when ``workspace`` is ``None``.
        resolver: Used to create the workspace when ``workspace`` is ``None``.
        workspace: Existing workspace to run on.
        page_id: Forwarded to ``processorClass``.
        log_level: Currently unused by this function.
        input_file_grp: Forwarded to ``processorClass``.
        output_file_grp: Forwarded to ``processorClass``.
        parameter: Forwarded unchanged to ``processorClass``; serialized with
            ``json.dumps`` for the profiling log line.
        parameter_override: Accepted but not used by this function.
        working_dir: Used to create the workspace when ``workspace`` is ``None``.

    Returns:
        The processor instance, after ``process()`` has run.

    Raises:
        Exception: From ``_get_workspace`` if no workspace can be obtained.
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    log.debug("Running processor %s", processorClass)
    processor = processorClass(
        workspace,
        ocrd_tool=ocrd_tool,
        page_id=page_id,
        input_file_grp=input_file_grp,
        output_file_grp=output_file_grp,
        parameter=parameter
    )
    # From here on use the tool description the processor instance exposes,
    # not necessarily the one that was passed in
    ocrd_tool = processor.ocrd_tool
    name = '%s v%s' % (ocrd_tool['executable'], processor.version)
    otherrole = ocrd_tool['steps'][0]
    logProfile = getLogger('ocrd.process.profile')
    log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
    t0 = time()
    processor.process()
    elapsed = time() - t0
    # Hand the arguments to the logger instead of pre-formatting with '%', so
    # the message is only rendered if the record is actually emitted
    logProfile.info(
        "Executing processor '%s' took %fs [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s']",
        ocrd_tool['executable'],
        elapsed,
        input_file_grp if input_file_grp else '',
        output_file_grp if output_file_grp else '',
        json.dumps(parameter) if parameter else '{}',
    )
    # Record this run as a software agent in the METS and persist it
    workspace.mets.add_agent(
        name=name,
        _type='OTHER',
        othertype='SOFTWARE',
        role='OTHER',
        otherrole=otherrole
    )
    workspace.save_mets()
    return processor
|
87
|
|
|
|
|
88
|
|
|
def run_cli(
        executable,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        overwrite=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
):
    """
    Run the processor command line ``executable`` as a subprocess on a workspace.

    The workspace is taken from ``workspace`` or, if that is ``None``, created
    by ``_get_workspace`` from ``resolver``, ``mets_url`` and ``working_dir``.
    Its ``directory`` attribute is always passed as ``--working-dir``.

    Args:
        executable: Program to run (first element of the argument list).
        mets_url: Passed as ``--mets`` unless it is ``None``; also used to
            create the workspace when ``workspace`` is ``None``.
        resolver: Used to create the workspace when ``workspace`` is ``None``.
        workspace: Existing workspace to run on.
        page_id: Passed as ``--page-id`` if truthy.
        overwrite: If truthy, the ``--overwrite`` flag is appended.
        log_level: Passed as ``--log-level`` if truthy.
        input_file_grp: Passed as ``--input-file-grp`` if truthy.
        output_file_grp: Passed as ``--output-file-grp`` if truthy.
        parameter: Passed as ``--parameter`` if truthy.
        working_dir: Used to create the workspace when ``workspace`` is ``None``.

    Returns:
        The return value of ``subprocess.call`` for the assembled command.

    Raises:
        Exception: From ``_get_workspace`` if no workspace can be obtained.
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    args = [executable, '--working-dir', workspace.directory]
    # When an existing workspace is passed, mets_url may legitimately be None.
    # Appending None would make both the join below and subprocess.call raise
    # TypeError, so the option is omitted and the executable applies its own
    # default for --mets.
    if mets_url is not None:
        args += ['--mets', mets_url]
    optional_args = (
        ('--log-level', log_level),
        ('--page-id', page_id),
        ('--input-file-grp', input_file_grp),
        ('--output-file-grp', output_file_grp),
        ('--parameter', parameter),
    )
    for option, value in optional_args:
        if value:
            args += [option, value]
    if overwrite:
        args += ['--overwrite']
    # Stringify for display only; the argument list itself is passed unchanged
    log.debug("Running subprocess '%s'", ' '.join(map(str, args)))
    return subprocess.call(args)
|
121
|
|
|
|
|
122
|
|
|
def generate_processor_help(ocrd_tool):
    """
    Build the command line help text of a processor from its tool description.

    Args:
        ocrd_tool: Mapping describing the tool. ``executable`` and
            ``description`` are read unconditionally; ``parameters``,
            ``input_file_grp`` and ``output_file_grp`` are optional. Every
            entry in ``parameters`` must have ``type`` and ``description`` and
            may have ``required``, ``default`` and ``enum``.

    Returns:
        The help text as a single string. It contains the usage line, the tool
        description, the fixed list of options, one wrapped entry per parameter
        (or ``NONE``) and the default input/output file group wiring.
    """
    parameters = ocrd_tool.get('parameters')
    if not parameters:
        parameter_help = ' NONE\n'
    else:
        def wrap(s):
            return wrap_text(
                s,
                initial_indent=' '*3,
                subsequent_indent=' '*4,
                width=72,
                preserve_paragraphs=True)

        pieces = []
        for param_name, param in parameters.items():
            param_type = param['type']
            # A required parameter shows no default, even if one is declared
            if param.get('required'):
                annotation = ' - REQUIRED'
            elif 'default' in param:
                annotation = ' - %s' % json.dumps(param['default'])
            else:
                annotation = ''
            pieces.append(wrap('"%s" [%s%s]' % (param_name, param_type, annotation)))
            pieces.append('\n ' + wrap(param['description']))
            if 'enum' in param:
                pieces.append('\n ' + wrap('Possible values: %s' % json.dumps(param['enum'])))
            pieces.append("\n")
        parameter_help = ''.join(pieces)

    input_wiring = ocrd_tool.get('input_file_grp', 'NONE')
    output_wiring = ocrd_tool.get('output_file_grp', 'NONE')
    # NOTE(review): the template text below is reproduced exactly as visible in
    # the reviewed source - confirm indentation and column alignment against
    # the original file.
    template = '''
Usage: %s [OPTIONS]

%s

Options:
-I, --input-file-grp USE File group(s) used as input
-O, --output-file-grp USE File group(s) used as output
-g, --page-id ID Physical page ID(s) to process
--overwrite Remove existing output pages/images
(with --page-id, remove only those)
-p, --parameter JSON-PATH Parameters, either verbatim JSON string
or JSON file path
-P, --param-override KEY VAL Override a single JSON object key-value pair,
taking precedence over --parameter
-m, --mets URL-PATH URL or file path of METS to process
-w, --working-dir PATH Working directory of local workspace
-l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
Log level
-J, --dump-json Dump tool description as JSON and exit
-h, --help This help message
-V, --version Show version

Parameters:
%s
Default Wiring:
%s -> %s

'''
    return template % (
        ocrd_tool['executable'],
        ocrd_tool['description'],
        parameter_help,
        input_wiring,
        output_wiring,
    )
|
176
|
|
|
|
|
177
|
|
|
|
|
178
|
|
|
|
|
179
|
|
|
|