1
|
|
|
import sys |
2
|
|
|
from contextlib import nullcontext |
3
|
|
|
|
4
|
|
|
from ocrd_utils import ( |
5
|
|
|
config, |
6
|
|
|
initLogging, |
7
|
|
|
is_local_filename, |
8
|
|
|
get_local_filename, |
9
|
|
|
getLogger, |
10
|
|
|
parse_json_string_with_comments, |
11
|
|
|
set_json_key_value_overrides, |
12
|
|
|
parse_json_string_or_file, |
13
|
|
|
redirect_stderr_and_stdout_to_file, |
14
|
|
|
) |
15
|
|
|
from ocrd_validators import WorkspaceValidator |
16
|
|
|
|
17
|
|
|
from ..resolver import Resolver |
18
|
|
|
from ..processor.base import ResourceNotFoundError, run_processor |
19
|
|
|
|
20
|
|
|
from .loglevel_option import ocrd_loglevel |
21
|
|
|
from .parameter_option import parameter_option, parameter_override_option |
22
|
|
|
from .ocrd_cli_options import ocrd_cli_options |
23
|
|
|
from .mets_find_options import mets_find_options |
24
|
|
|
|
25
|
|
|
|
26
|
|
|
def ocrd_cli_wrap_processor(
    processorClass,
    mets=None,
    mets_server_url=None,
    working_dir=None,
    dump_json=False,
    dump_module_dir=False,
    help=False, # pylint: disable=redefined-builtin
    profile=False,
    profile_file=None,
    version=False,
    overwrite=False,
    debug=False,
    resolve_resource=None,
    show_resource=None,
    list_resources=False,
    # ocrd_network params start #
    subcommand=None,
    address=None,
    queue=None,
    log_filename=None,
    database=None,
    # ocrd_network params end #
    **kwargs
):
    """
    Turn parsed command line options into a processor run.

    The options are handled in a fixed order:

    1. Informational requests (no arguments at all, ``help``, ``version``,
       ``dump_json``, ``dump_module_dir``, ``resolve_resource``,
       ``show_resource``, ``list_resources``) are answered by the processor
       instance and end the process through ``sys.exit``.
    2. If any of ``subcommand``, ``address``, ``queue`` or ``database`` is
       set, ``check_and_run_network_agent`` is called.
    3. Otherwise the parameters are parsed, the workspace is resolved, the
       file groups are validated, profiling is optionally enabled and
       ``run_processor`` is invoked.

    Args:
        processorClass: processor class; instantiated here as
            ``processorClass(None)`` and also handed to ``run_processor``.
        mets, mets_server_url, working_dir: passed through
            ``Resolver.resolve_mets_arguments`` and then used to open the
            workspace with ``Resolver.workspace_from_url``.
        dump_json, dump_module_dir, help, version, list_resources: when
            truthy, call the processor method of the same purpose and exit.
        profile, profile_file: enable ``cProfile``; also switched on by
            ``'CPU' in config.OCRD_PROFILE`` / ``config.OCRD_PROFILE_FILE``.
            Statistics are printed (and dumped to ``profile_file`` if set)
            by an ``atexit`` hook.
        overwrite: sets ``config.OCRD_EXISTING_OUTPUT`` to ``'OVERWRITE'``
            and leaves the output file group out of the validation.
        debug: sets ``config.OCRD_MISSING_INPUT``, ``OCRD_MISSING_OUTPUT``
            and ``OCRD_EXISTING_OUTPUT`` to ``'ABORT'``.
        resolve_resource: resource name to resolve and print, then exit.
        show_resource: resource name passed to ``processor.show_resource``,
            then exit.
        subcommand, address, queue, database: network agent options,
            forwarded to ``check_and_run_network_agent``.
        log_filename: when set, the processor runs inside
            ``redirect_stderr_and_stdout_to_file(log_filename)``.
        **kwargs: must contain ``input_file_grp`` and ``output_file_grp``;
            ``parameter``, ``parameter_override`` and ``page_id`` are read
            if present. Everything left over (with ``parameter`` replaced by
            the parsed dict and ``parameter_override`` removed) is forwarded
            to ``run_processor``.

    Raises:
        ValueError: if ``input_file_grp`` or ``output_file_grp`` is empty.
        Exception: if ``WorkspaceValidator.check_file_grp`` reports errors.
    """
    # Install our logging handlers first so no imported library can preempt them.
    initLogging()

    def _exit_resource_not_found(error):
        # Shared failure path of --resolve-resource and --show-resource.
        getLogger('ocrd.processor.base').critical(error.message)
        sys.exit(1)

    # FIXME: remove workspace arg entirely
    instance = processorClass(None)

    # --- informational requests: answer and leave -------------------------
    if not sys.argv[1:]:
        # Nothing on the command line: show the help, but exit with status 1.
        instance.show_help(subcommand=subcommand)
        sys.exit(1)
    if help:
        instance.show_help(subcommand=subcommand)
        sys.exit()
    if version:
        instance.show_version()
        sys.exit()
    if dump_json:
        instance.dump_json()
        sys.exit()
    if dump_module_dir:
        instance.dump_module_dir()
        sys.exit()
    if resolve_resource:
        try:
            resolved = instance.resolve_resource(resolve_resource)
        except ResourceNotFoundError as error:
            _exit_resource_not_found(error)
        else:
            print(resolved)
            sys.exit()
    if show_resource:
        try:
            instance.show_resource(show_resource)
        except ResourceNotFoundError as error:
            _exit_resource_not_found(error)
        else:
            sys.exit()
    if list_resources:
        instance.list_resources()
        sys.exit()

    # --- network agents ---------------------------------------------------
    if subcommand or address or queue or database:
        # Used for checking/starting network agents for the WebAPI architecture
        check_and_run_network_agent(processorClass, subcommand, address, database, queue)

    # --- parameters -------------------------------------------------------
    if 'parameter' in kwargs:
        # The value may be a JSON literal or a file/preset name; presets are
        # looked up among the processor's resources (None when not found).
        def _lookup_preset(name):
            try:
                return instance.resolve_resource(name)
            except ResourceNotFoundError:
                return None
        kwargs['parameter'] = parse_json_string_or_file(
            *kwargs['parameter'],
            resolve_preset_file=_lookup_preset)
    else:
        kwargs['parameter'] = {}
    # Merge parameter overrides into the parsed parameters
    if 'parameter_override' in kwargs:
        overrides = kwargs.pop('parameter_override')
        set_json_key_value_overrides(kwargs['parameter'], *overrides)

    # --- file groups and workspace ----------------------------------------
    if not kwargs['input_file_grp']:
        raise ValueError('-I/--input-file-grp is required')
    if not kwargs['output_file_grp']:
        raise ValueError('-O/--output-file-grp is required')
    mets_resolver = Resolver()
    working_dir, mets, _, mets_server_url = mets_resolver.resolve_mets_arguments(
        working_dir, mets, None, mets_server_url)
    workspace = mets_resolver.workspace_from_url(
        mets, working_dir, mets_server_url=mets_server_url)
    if debug:
        for option in ('OCRD_MISSING_INPUT', 'OCRD_MISSING_OUTPUT', 'OCRD_EXISTING_OUTPUT'):
            setattr(config, option, 'ABORT')
    if overwrite:
        # Deliberately after the debug block: overwrite wins for existing output.
        config.OCRD_EXISTING_OUTPUT = 'OVERWRITE'
    # With overwrite, the output group is not validated at all.
    output_grp_to_check = '' if overwrite else kwargs['output_file_grp']
    grp_report = WorkspaceValidator.check_file_grp(
        workspace,
        kwargs['input_file_grp'],
        output_grp_to_check,
        kwargs.get('page_id'))
    if not grp_report.is_valid:
        raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(grp_report.errors))

    # --- profiling (flags first, then environment configuration) -----------
    profile = profile or 'CPU' in config.OCRD_PROFILE
    if not profile_file and config.is_set('OCRD_PROFILE_FILE'):
        profile_file = config.OCRD_PROFILE_FILE
    if profile or profile_file:
        import atexit
        import cProfile
        import io
        import pstats
        print("Profiling...")
        profiler = cProfile.Profile()
        profiler.enable()

        def _report_profile():
            # Runs at interpreter exit: stop profiling and emit the statistics.
            profiler.disable()
            print("Profiling completed")
            if profile_file:
                profiler.dump_stats(profile_file)
            stats_text = io.StringIO()
            stats = pstats.Stats(profiler, stream=stats_text)
            stats.sort_stats("cumulative").print_stats()
            print(stats_text.getvalue())
        atexit.register(_report_profile)

    # --- run --------------------------------------------------------------
    output_redirect = (
        redirect_stderr_and_stdout_to_file(log_filename) if log_filename else nullcontext()
    )
    with output_redirect:
        run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs)
156
|
|
|
|
157
|
|
|
|
158
|
|
|
def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str):
    """
    Validate the network agent options and start the requested agent.

    The option combination is checked first; only then is the processor
    instantiated (``ProcessorClass(workspace=None)``) to obtain its
    ``ocrd_tool`` description, and the agent is started:

    - ``AgentType.PROCESSING_WORKER``: a ``ProcessingWorker`` is created,
      connected (``connect_consumer``) and told to ``start_consuming``.
    - ``AgentType.PROCESSOR_SERVER``: a ``ProcessorServer`` is created and
      ``run_server`` is called with the host and port from ``address``.

    This function does not return normally: once the agent call comes back,
    the process is ended with ``sys.exit(0)``.

    Args:
        ProcessorClass: processor class to be served by the agent.
        subcommand: agent type; must be one of ``AgentType.PROCESSING_WORKER``
            or ``AgentType.PROCESSOR_SERVER``.
        address: ``HOST:PORT`` for the processor server; must be empty for
            the processing worker.
        database: passed as ``mongodb_addr`` to the agent; required for both
            agent types.
        queue: passed as ``rabbitmq_addr`` to the processing worker; must be
            empty for the processor server.

    Raises:
        ValueError: if ``subcommand`` is missing or unknown, if a required
            option is missing, if an option is given that the subcommand does
            not accept, or if ``address`` cannot be split into exactly one
            host and one integer port.
    """
    from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
    SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER]

    if not subcommand:
        raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}")
    if subcommand not in SUBCOMMANDS:
        raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}")

    # Both agent types need the database. This branch fires when the option is
    # *missing*, so the message says "required" (it used to say "invalid").
    if not database:
        raise ValueError(f"Option '--database' required for subcommand {subcommand}")

    # From here on, subcommand is known to be one of the two agent types.
    run_as_worker = subcommand == AgentType.PROCESSING_WORKER
    if run_as_worker:
        if address:
            raise ValueError(f"Option '--address' invalid for subcommand {subcommand}")
        if not queue:
            raise ValueError(f"Option '--queue' required for subcommand {subcommand}")
    else:
        if not address:
            raise ValueError(f"Option '--address' required for subcommand {subcommand}")
        if queue:
            raise ValueError(f"Option '--queue' invalid for subcommand {subcommand}")

    # Instantiate only after validation, so bad options fail fast.
    processor = ProcessorClass(workspace=None)
    if run_as_worker:
        processing_worker = ProcessingWorker(
            rabbitmq_addr=queue,
            mongodb_addr=database,
            processor_name=processor.ocrd_tool['executable'],
            ocrd_tool=processor.ocrd_tool,
            processor_class=ProcessorClass,
        )
        # The RMQConsumer is initialized and a connection to the RabbitMQ is performed
        processing_worker.connect_consumer()
        # Start consuming from the queue with name `processor_name`
        processing_worker.start_consuming()
    else:
        # TODO: Better validate that inside the ProcessorServer itself
        host, port = address.split(':')
        processor_server = ProcessorServer(
            mongodb_addr=database,
            processor_name=processor.ocrd_tool['executable'],
            processor_class=ProcessorClass,
        )
        processor_server.run_server(host=host, port=int(port))
    sys.exit(0)
208
|
|
|
|