|
1
|
|
|
from os.path import isfile |
|
2
|
|
|
from re import match, sub, IGNORECASE |
|
3
|
|
|
from itertools import product |
|
4
|
|
|
import sys |
|
5
|
|
|
from string import Template |
|
6
|
|
|
|
|
7
|
|
|
import click |
|
8
|
|
|
|
|
9
|
|
|
from ocrd_utils import ( |
|
10
|
|
|
is_local_filename, |
|
11
|
|
|
get_local_filename, |
|
12
|
|
|
setOverrideLogLevel, |
|
13
|
|
|
parse_json_string_or_file, |
|
14
|
|
|
set_json_key_value_overrides, |
|
15
|
|
|
) |
|
16
|
|
|
|
|
17
|
|
|
from ocrd_utils import getLogger |
|
18
|
|
|
from .resolver import Resolver |
|
19
|
|
|
from .processor.base import run_processor |
|
20
|
|
|
from ocrd_validators import WorkspaceValidator |
|
21
|
|
|
from ocrd_models.ocrd_mets_filter import FIELDS |
|
22
|
|
|
|
|
23
|
|
|
def _set_root_logger_version(ctx, param, value):    # pylint: disable=unused-argument
    """
    Click callback for the log-level option.

    Passes ``value`` to ``setOverrideLogLevel`` and returns it unchanged so
    that click stores it as the option's value. ``ctx`` and ``param`` are
    only present because click's callback signature requires them.
    """
    setOverrideLogLevel(value)
    return value
|
26
|
|
|
|
|
27
|
|
|
# Reusable ``-l/--log-level`` option; the selected value is handed to the
# ``_set_root_logger_version`` callback.
loglevel_option = click.option(
    '-l', '--log-level',
    type=click.Choice(['OFF', 'ERROR', 'WARN', 'INFO', 'DEBUG', 'TRACE']),
    default=None,
    callback=_set_root_logger_version,
    help="Log level",
)
|
30
|
|
|
|
|
31
|
|
|
def _handle_param_option(ctx, param, value):
    """
    Click callback for the ``-p/--parameter`` option.

    ``value`` holds every occurrence of the option (it is declared with
    ``multiple=True``); all of them are forwarded as positional arguments to
    ``parse_json_string_or_file`` and its result becomes the option's value.
    """
    return parse_json_string_or_file(*value)
|
33
|
|
|
|
|
34
|
|
|
# Reusable ``-p/--parameter`` option. It may be repeated; the collected values
# are converted by ``_handle_param_option``. Without any occurrence the
# default is a single empty JSON object.
parameter_option = click.option(
    '-p', '--parameter',
    multiple=True,
    default=['{}'],
    callback=_handle_param_option,
    help="Parameters, either JSON string or path to JSON file",
)
|
39
|
|
|
|
|
40
|
|
|
# Reusable ``-P/--parameter-override`` option. Each occurrence takes two
# arguments and the option may be repeated.
parameter_override_option = click.option(
    '-P', '--parameter-override',
    nargs=2,
    multiple=True,
    # Identity callback: the collected pairs are passed through unchanged.
    callback=lambda ctx, param, pairs: pairs,
    help="Parameter override",
)
|
45
|
|
|
# callback=lambda ctx, param, kv: {kv[0]: kv[1]}) |
|
46
|
|
|
|
|
47
|
|
|
def ocrd_cli_wrap_processor(
    processorClass,
    ocrd_tool=None,
    mets=None,
    working_dir=None,
    dump_json=False,
    help=False, # pylint: disable=redefined-builtin
    version=False,
    overwrite=False,
    **kwargs
):
    """
    Run ``processorClass`` with the options collected from the command line.

    Control flow:

    * If any of ``dump_json``, ``help`` or ``version`` is set, logging is
      switched off, the processor is instantiated without a workspace so it
      can emit the requested information, and the process exits via
      ``sys.exit()``.
    * If ``mets`` is empty, or is a local filename that does not exist, the
      processor's help is shown and the process exits with status 1. For a
      missing file the reason is logged first.
    * Otherwise a workspace is created from ``mets`` / ``working_dir``, the
      input/output file groups are validated and ``run_processor`` is called.

    Args:
        processorClass: Processor class to instantiate / run.
        ocrd_tool: Passed through to ``run_processor``.
        mets: METS location (the CLI option ``--mets``).
        working_dir: Passed to ``Resolver.workspace_from_url``.
        dump_json: Dump the tool description and exit.
        help: Show help and exit.
        version: Show version and exit.
        overwrite: Set ``overwrite_mode`` on the workspace and skip the
            output file group check.
        **kwargs: Remaining CLI options. ``input_file_grp`` and
            ``output_file_grp`` are required here; ``page_id``, ``parameter``
            and ``parameter_override`` are read when present. All of them are
            forwarded to ``run_processor``.

    Raises:
        SystemExit: From the informational and the missing-METS paths.
        Exception: If the file group validation report is not valid.
    """
    if dump_json or help or version:
        setOverrideLogLevel('OFF', silent=True)
        processorClass(workspace=None, dump_json=dump_json, show_help=help, show_version=version)
        sys.exit()

    LOG = getLogger('ocrd_cli_wrap_processor')

    # A METS location is mandatory, and a local one must exist on disk.
    # Previously the "File does not exist" diagnostic sat in a second,
    # identical check further down that could never be reached because this
    # guard already exits; report it here instead.
    mets_missing = bool(mets) and is_local_filename(mets) and not isfile(get_local_filename(mets))
    if not mets or mets_missing:
        if mets_missing:
            LOG.error("File does not exist: %s", mets)
        processorClass(workspace=None, show_help=True)
        sys.exit(1)

    # Merge parameter overrides and parameters.
    # NOTE(review): the return value is discarded, so this presumably updates
    # kwargs['parameter'] in place -- confirm against ocrd_utils.
    if 'parameter_override' in kwargs:
        set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override'])

    # TODO OCR-D/core#274: assert that -I/--input-file-grp and
    # -O/--output-file-grp were given instead of relying on defaults.

    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets, working_dir)
    page_id = kwargs.get('page_id')

    # XXX Removing the existing output files here when --overwrite is given is
    # not possible while processors do not adhere to
    # https://github.com/OCR-D/core/issues/505. While that issue is open, set
    # 'overwrite_mode' globally on the workspace instead.
    if overwrite:
        workspace.overwrite_mode = True

    # With --overwrite the output file group is not checked (empty string).
    report = WorkspaceValidator.check_file_grp(
        workspace,
        kwargs['input_file_grp'],
        '' if overwrite else kwargs['output_file_grp'],
        page_id)
    if not report.is_valid:
        raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors))

    run_processor(processorClass, ocrd_tool, mets, workspace=workspace, **kwargs)
|
107
|
|
|
|
|
108
|
|
|
def ocrd_loglevel(f):
    """
    Decorator that attaches the ``-l/--log-level`` option to ``f``.

    Applies ``loglevel_option`` to ``f`` and returns ``f`` itself.
    """
    loglevel_option(f)
    return f
|
114
|
|
|
|
|
115
|
|
|
def ocrd_cli_options(f):
    """
    Decorator that attaches the common processor command-line options to ``f``.

    The options are ``-m/--mets``, ``-w/--working-dir``, ``-I/--input-file-grp``,
    ``-O/--output-file-grp``, ``-g/--page-id``, ``--overwrite``,
    ``-p/--parameter``, ``-P/--parameter-override``, ``-J/--dump-json``,
    ``-l/--log-level``, ``-V/--version`` and ``-h/--help``.

    Usage (after importing ``ocrd_cli_options`` from this module)::

        @click.command()
        @ocrd_cli_options
        def cli(mets, **kwargs):
            print(mets)
    """
    # The options are applied to ``f`` in exactly this order.
    option_decorators = (
        click.option('-m', '--mets', help="METS to process", default="mets.xml"),
        click.option('-w', '--working-dir', help="Working Directory"),
        # TODO OCR-D/core#274: make -I and -O required instead of defaulting them.
        click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT'),
        click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT'),
        click.option('-g', '--page-id', help="ID(s) of the pages to process"),
        click.option('--overwrite', help="Overwrite the output file group or a page range (--page-id)", is_flag=True, default=False),
        parameter_option,
        parameter_override_option,
        click.option('-J', '--dump-json', help="Dump tool description as JSON and exit", is_flag=True, default=False),
        loglevel_option,
        click.option('-V', '--version', help="Show version", is_flag=True, default=False),
        click.option('-h', '--help', help="This help message", is_flag=True, default=False),
    )
    for attach in option_decorators:
        attach(f)
    return f
|
148
|
|
|
|
|
149
|
|
|
# Default ``string.Template`` sources used by ``ocrd_mets_filter_options``.
# ``${field}`` and ``${operator}`` are always substituted; the 'help' template
# additionally receives ``${operation}`` and ``${type}``.
TEMPLATE_DEFAULTS = {
    # Placeholder shown for the option's argument.
    'metavar': 'PAT',
    # Whether the option is mandatory (rendered to text, compared with 'True').
    'required': False,
    # Name of the keyword argument passed to the decorated command.
    'parameter': '${field}_${operator}clude',
    # Full help text, composed from the three help_* templates below.
    'help': '${field} ${operation} ${type}',
    'help_field': '${field}',
    'help_operation': 'to ${operator}clude',
    'help_type': '(string/regex/comma-separated)',
}
|
158
|
|
|
class ocrd_mets_filter_options():
    """
    Decorator adding include/exclude filter options to a click command.

    One option is created for every combination of field and operator. The
    option's kwarg name, metavar, help text and required flag are rendered
    from ``string.Template`` sources that default to ``TEMPLATE_DEFAULTS`` and
    can be overridden through keyword arguments to the constructor.
    """

    def __init__(self, fields=FIELDS, operators=None, **templates):
        """
        Pre-compile the templates for every field/operator combination.

        Args:
            fields: Field names to create options for.
            operators: Operator prefixes; ``['ex', 'in']`` if omitted or empty.
            **templates: Overrides for ``TEMPLATE_DEFAULTS`` entries. A value
                is either a single template source or a dict from lookup key
                to template source. Falsy values and missing dict keys fall
                back to the default of the same template name.
        """
        self.fields = fields
        self.operators = operators or ['ex', 'in']
        sources = {**TEMPLATE_DEFAULTS, **templates}
        self.templates = {}
        for (name, source), field, operator in product(sources.items(), self.fields, self.operators):
            compiled = self.templates.setdefault(name, {})
            key = self._template_key(name, field, operator)
            if key in compiled:
                continue
            if isinstance(source, dict):
                chosen = source[key] if key in source else TEMPLATE_DEFAULTS[name]
            else:
                chosen = source if source else TEMPLATE_DEFAULTS[name]
            compiled[key] = Template(str(chosen))

    @staticmethod
    def _template_key(tpl_name, field, operator):
        """
        Return the key under which a template is stored for ``tpl_name``.

        'help_operation' is keyed per operator, 'parameter' and 'required' per
        field/operator pair, everything else per field.
        """
        if tpl_name == 'help_operation':
            return '%sclude' % operator
        if tpl_name in ('parameter', 'required'):
            return '%s_%sclude' % (field, operator)
        return field

    @staticmethod
    def _split_on_comma(ctx, param, value):     # pylint: disable=unused-argument
        """Click callback: turn a comma-separated value into a list, else pass it through."""
        if value and ',' in value:
            return value.split(',')
        return value

    def _expand_template(self, tpl_name, field, operator, tpl_vars):
        """
        Render the template ``tpl_name`` for ``field``/``operator`` with ``tpl_vars``.

        Returns the substituted string, except for 'required', where the
        result is the boolean "rendered text equals ``'True'``".
        """
        template = self.templates[tpl_name][self._template_key(tpl_name, field, operator)]
        rendered = template.safe_substitute(tpl_vars)
        if tpl_name == 'required':
            return rendered == 'True'
        return rendered

    def __call__(self, f):
        """Attach one click option per field/operator combination to ``f`` and return ``f``."""
        # XXX must be retained for backwards-compatibility:
        # extra flags accepted for the 'in' operator of these fields
        legacy_flags = {
            'ID': ['-i', '--file-id'],
            'pageId': ['-g', '--page-id'],
            'fileGrp': ['-G', '--file-grp'],
            'mimetype': ['-m', '--mimetype'],
        }
        for field, operator in product(self.fields, self.operators):

            def render(tpl_name, extra_vars=None):
                # Only called within the current iteration, so ``field`` and
                # ``operator`` refer to the current combination.
                tpl_vars = {'field': field, 'operator': operator}
                if extra_vars:
                    tpl_vars.update(extra_vars)
                return self._expand_template(tpl_name, field, operator, tpl_vars=tpl_vars)

            # XXX Controls the kwarg name of this field in the decorated command
            names = [render('parameter')]
            required = render('required')
            metavar = render('metavar')
            help_text = render('help', {
                'field': render('help_field'),
                'operation': render('help_operation'),
                'type': render('help_type'),
            })

            # XXX No regex search for pageId search currently
            if field == 'pageId' and operator == 'in':
                help_text = sub(r'[,/]?\s*regexp?\b', '', help_text, flags=IGNORECASE)

            if operator == 'in':
                names.extend(legacy_flags.get(field, []))

            # Long flag: lower-cased field name, prefixed with 'not-' for exclusion.
            names.append('--%s%s' % ('not-' if operator == 'ex' else '', field.lower()))

            click.option(
                *names,
                default=None,
                callback=self._split_on_comma,
                required=required,
                metavar=metavar,
                help=help_text
            )(f)
        return f
|
242
|
|
|
|