1
|
|
|
from os.path import isfile |
2
|
|
|
from re import match, sub, IGNORECASE |
3
|
|
|
from itertools import product |
4
|
|
|
import sys |
5
|
|
|
from string import Template |
6
|
|
|
|
7
|
|
|
import click |
8
|
|
|
|
9
|
|
|
from ocrd_utils import ( |
10
|
|
|
is_local_filename, |
11
|
|
|
get_local_filename, |
12
|
|
|
setOverrideLogLevel, |
13
|
|
|
parse_json_string_or_file, |
14
|
|
|
set_json_key_value_overrides, |
15
|
|
|
) |
16
|
|
|
|
17
|
|
|
from ocrd_utils import getLogger |
18
|
|
|
from .resolver import Resolver |
19
|
|
|
from .processor.base import run_processor |
20
|
|
|
from ocrd_validators import WorkspaceValidator |
21
|
|
|
from ocrd_models.ocrd_mets_filter import FIELDS |
22
|
|
|
|
23
|
|
|
def _set_root_logger_version(ctx, param, value):  # pylint: disable=unused-argument
    """
    Click callback for ``--log-level``.

    Forwards the selected level to ``setOverrideLogLevel`` and returns the
    value unchanged. ``ctx`` and ``param`` are accepted only because click
    passes them to every option callback.
    """
    setOverrideLogLevel(value)
    return value
26
|
|
|
|
27
|
|
|
# Reusable ``-l/--log-level`` option decorator. Its callback forwards the
# chosen level to ``setOverrideLogLevel``.
loglevel_option = click.option(
    '-l', '--log-level',
    type=click.Choice(['OFF', 'ERROR', 'WARN', 'INFO', 'DEBUG', 'TRACE']),
    default=None,
    callback=_set_root_logger_version,
    help="Log level",
)
30
|
|
|
|
31
|
|
|
def _handle_param_option(ctx, param, value):  # pylint: disable=unused-argument
    """
    Click callback for ``-p/--parameter``.

    ``value`` is the sequence of all ``-p`` arguments given; each element is
    passed as a separate positional argument to ``parse_json_string_or_file``
    and that function's result is returned.
    """
    return parse_json_string_or_file(*value)
33
|
|
|
|
34
|
|
|
# Reusable ``-p/--parameter`` option decorator. The option may be repeated;
# all occurrences are parsed together by ``_handle_param_option``. Without any
# ``-p`` the default is a single empty JSON object.
parameter_option = click.option(
    '-p', '--parameter',
    multiple=True,
    default=['{}'],
    callback=_handle_param_option,
    help="Parameters, either JSON string or path to JSON file",
)
39
|
|
|
|
40
|
|
|
# Reusable ``-P/--parameter-override`` option decorator. Each occurrence takes
# two arguments and the option may be repeated; the callback passes the
# collected pairs through unchanged.
parameter_override_option = click.option(
    '-P', '--parameter-override',
    nargs=2,
    multiple=True,
    callback=lambda ctx, param, kv: kv,
    # callback=lambda ctx, param, kv: {kv[0]: kv[1]},
    help="Parameter override",
)
46
|
|
|
|
47
|
|
|
def ocrd_cli_wrap_processor(
    processorClass,
    ocrd_tool=None,
    mets=None,
    working_dir=None,
    dump_json=False,
    help=False, # pylint: disable=redefined-builtin
    version=False,
    overwrite=False,
    **kwargs
):
    """
    Run ``processorClass`` with options as produced by ``ocrd_cli_options``.

    Args:
        processorClass: processor class; instantiated directly for the
            informational flags and otherwise handed to ``run_processor``.
        ocrd_tool: passed through to ``run_processor``.
        mets: METS location; when it is a local filename, the file must exist.
        working_dir: passed to ``Resolver.workspace_from_url``.
        dump_json, help, version: if any is true, logging is switched off,
            the processor is instantiated with the matching
            ``dump_json``/``show_help``/``show_version`` flags and the
            process exits.
        overwrite: if true, ``overwrite_mode`` is set on the workspace and
            the output file group is not validated (an empty string is
            passed to the validator instead).
        **kwargs: remaining options, forwarded to ``run_processor``. Must
            contain ``input_file_grp`` and ``output_file_grp``; must contain
            ``parameter`` whenever ``parameter_override`` is present;
            ``page_id`` is optional.

    Raises:
        SystemExit: after the informational flags (exit status 0), or with
            status 1 when ``mets`` is empty or names a missing local file
            (the processor help is shown in that case).
        Exception: when ``WorkspaceValidator.check_file_grp`` reports the
            input/output file groups as invalid.
    """
    if dump_json or help or version:
        setOverrideLogLevel('OFF', silent=True)
        processorClass(workspace=None, dump_json=dump_json, show_help=help, show_version=version)
        sys.exit()

    LOG = getLogger('ocrd_cli_wrap_processor')
    if not mets or (is_local_filename(mets) and not isfile(get_local_filename(mets))):
        # A second, identical existence check further down could never fire
        # because this branch exits first; its message is reported here so the
        # user learns why the help text is being shown.
        if mets:
            LOG.error("File does not exist: %s", mets)
        processorClass(workspace=None, show_help=True)
        sys.exit(1)
    # LOG.info('kwargs=%s' % kwargs)
    # Merge parameter overrides and parameters
    if 'parameter_override' in kwargs:
        set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override'])
    # TODO OCR-D/core#274
    # Assert -I / -O
    # if not kwargs['input_file_grp']:
    #     raise ValueError('-I/--input-file-grp is required')
    # if not kwargs['output_file_grp']:
    #     raise ValueError('-O/--output-file-grp is required')
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets, working_dir)
    page_id = kwargs.get('page_id')
    # XXX not possible while processors do not adhere to # https://github.com/OCR-D/core/issues/505
    # if overwrite
    #     if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']:
    #         raise Exception("--overwrite requires --output-file-grp")
    #     LOG.info("Removing files because of --overwrite")
    #     for grp in kwargs['output_file_grp'].split(','):
    #         if page_id:
    #             for one_page_id in kwargs['page_id'].split(','):
    #                 LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id)
    #                 for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp):
    #                     workspace.remove_file(file, force=True, keep_file=False, page_recursive=True)
    #         else:
    #             LOG.debug("Removing all files in output file group %s ", grp)
    #             # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors)
    #             workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False)
    #     workspace.save_mets()
    # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace
    if overwrite:
        workspace.overwrite_mode = True
    # With --overwrite the output group is exempt from validation.
    report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id)
    if not report.is_valid:
        raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors))
    run_processor(processorClass, ocrd_tool, mets, workspace=workspace, **kwargs)
107
|
|
|
|
108
|
|
|
def ocrd_loglevel(f):
    """
    Decorator adding the ``-l/--log-level`` option to ``f``.

    Applies ``loglevel_option`` and returns the decorated object.
    """
    # click option decorators register the option on ``f`` and return ``f``.
    return loglevel_option(f)
114
|
|
|
|
115
|
|
|
def ocrd_cli_options(f):
    """
    Implement MP CLI.

    Decorator that attaches the common processor options to ``f``: METS
    location, working directory, input/output file groups, page IDs,
    overwrite flag, parameters and parameter overrides, JSON dump, log
    level, version and help.

    Usage (after importing ``ocrd_cli_options`` from this module)::

        @click.command()
        @ocrd_cli_options
        def cli(mets, **kwargs):
            print(mets)
    """
    option_decorators = (
        click.option('-m', '--mets', help="METS to process", default="mets.xml"),
        click.option('-w', '--working-dir', help="Working Directory"),
        # TODO OCR-D/core#274
        # click.option('-I', '--input-file-grp', help='File group(s) used as input. **required**'),
        # click.option('-O', '--output-file-grp', help='File group(s) used as output. **required**'),
        click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT'),
        click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT'),
        click.option('-g', '--page-id', help="ID(s) of the pages to process"),
        click.option('--overwrite', help="Overwrite the output file group or a page range (--page-id)", is_flag=True, default=False),
        parameter_option,
        parameter_override_option,
        click.option('-J', '--dump-json', help="Dump tool description as JSON and exit", is_flag=True, default=False),
        loglevel_option,
        click.option('-V', '--version', help="Show version", is_flag=True, default=False),
        click.option('-h', '--help', help="This help message", is_flag=True, default=False),
    )
    # Each decorator registers its option on ``f`` in place.
    for decorate in option_decorators:
        decorate(f)
    return f
148
|
|
|
|
149
|
|
|
# Default templates used by ``ocrd_mets_filter_options``. Every value is
# converted with ``str()`` and wrapped in a ``string.Template``; ``${field}``
# and ``${operator}`` are filled in per generated option, and the 'help'
# template additionally receives ``${operation}`` and ``${type}`` from the
# expanded 'help_operation' and 'help_type' templates.
TEMPLATE_DEFAULTS = {
    # metavar shown for the option value
    'metavar': 'PAT',
    # whether the option is required (compared against the string 'True')
    'required': False,
    # name of the keyword argument passed to the decorated command
    'parameter': '${field}_${operator}clude',
    # overall help text, assembled from the three help_* templates below
    'help': '${field} ${operation} ${type}',
    'help_field': '${field}',
    'help_operation': 'to ${operator}clude',
    'help_type': '(string/regex/comma-separated)',
}
158
|
|
|
class ocrd_mets_filter_options():
    """
    Adds include/exclude filter options

    Instances are decorators: calling one on a function attaches one click
    option per (field, operator) combination. Names, help texts, metavars and
    the ``required`` flag of the options come from ``string.Template``
    templates that can be overridden per instance.
    """

    def __init__(self, fields=FIELDS, operators=None, **templates):
        """
        Prepare the templates for every field/operator combination.

        Args:
            fields: field names to generate options for.
            operators: operator prefixes; ``['ex', 'in']`` when empty/None.
            **templates: overrides for entries of ``TEMPLATE_DEFAULTS``.
                A value is either a single template applied to all keys, or
                a dict mapping individual keys to templates. Falsy values and
                keys missing from such a dict fall back to the default.
        """
        self.fields = fields
        self.operators = operators or ['ex', 'in']
        merged = {**TEMPLATE_DEFAULTS, **templates}
        self.templates = {}
        for (tpl_name, tpl), field, operator in product(merged.items(), self.fields, self.operators):
            per_name = self.templates.setdefault(tpl_name, {})
            key = self._template_key(tpl_name, field, operator)
            if key in per_name:
                continue
            # The default is looked up only when actually needed, so template
            # names unknown to TEMPLATE_DEFAULTS work if they supply a value.
            if isinstance(tpl, dict):
                text = tpl[key] if key in tpl else TEMPLATE_DEFAULTS[tpl_name]
            else:
                text = tpl or TEMPLATE_DEFAULTS[tpl_name]
            per_name[key] = Template(str(text))

    @staticmethod
    def _template_key(tpl_name, field, operator):
        """
        Return the key under which a template of kind ``tpl_name`` is stored:
        per operator for 'help_operation', per field and operator for
        'parameter' and 'required', per field for everything else.
        """
        if tpl_name == 'help_operation':
            return '%sclude' % operator
        if tpl_name in ('parameter', 'required'):
            return '%s_%sclude' % (field, operator)
        return field

    def _expand_template(self, tpl_name, field, operator, tpl_vars):
        """
        Substitute ``tpl_vars`` into the stored template for
        ``tpl_name``/``field``/``operator``.

        Returns the expanded string, except for 'required', where the result
        is the boolean "expanded text equals 'True'". Unknown placeholders
        are left in place (``safe_substitute``).
        """
        key = self._template_key(tpl_name, field, operator)
        expanded = self.templates[tpl_name][key].safe_substitute(tpl_vars)
        if tpl_name == 'required':
            return expanded == 'True'
        return expanded

    def __call__(self, f):
        """
        Attach the generated filter options to ``f`` and return ``f``.
        """
        # XXX must be retained for backwards-compatibility
        legacy_flags = {
            'ID': ('-i', '--file-id'),
            'pageId': ('-g', '--page-id'),
            'fileGrp': ('-G', '--file-grp'),
            'mimetype': ('-m', '--mimetype'),
        }

        def split_on_comma(ctx, param, value):  # pylint: disable=unused-argument
            # Values containing a comma become a list; anything else
            # (including None) is passed through as is.
            if value and ',' in value:
                return value.split(',')
            return value

        for field, operator in product(self.fields, self.operators):

            # Only called within this iteration, so capturing the loop
            # variables is safe.
            def expand(tpl_name, **extra):  # pylint: disable=cell-var-from-loop
                tpl_vars = {'field': field, 'operator': operator, **extra}
                return self._expand_template(tpl_name, field, operator, tpl_vars=tpl_vars)

            # XXX Controls the kwarg name of this field in the decorated command
            option_decls = [expand('parameter')]

            required = expand('required')
            metavar = expand('metavar')
            help_text = expand(
                'help',
                field=expand('help_field'),
                operation=expand('help_operation'),
                type=expand('help_type'))

            # XXX No regex search for pageId search currently
            if field == 'pageId' and operator == 'in':
                help_text = sub(r'[,/]?\s*regexp?\b', '', help_text, flags=IGNORECASE)

            if operator == 'in':
                option_decls.extend(legacy_flags.get(field, ()))

            # Long option: lower-cased field name, prefixed with 'not-' for
            # the excluding operator.
            option_decls.append('--%s%s' % ('not-' if operator == 'ex' else '', field.lower()))

            click.option(
                *option_decls,
                default=None,
                callback=split_on_comma,
                required=required,
                metavar=metavar,
                help=help_text)(f)
        return f
242
|
|
|
|