|
1
|
|
|
import sys |
|
2
|
|
|
from contextlib import nullcontext |
|
3
|
|
|
|
|
4
|
|
|
from ocrd_utils import ( |
|
5
|
|
|
config, |
|
6
|
|
|
initLogging, |
|
7
|
|
|
is_local_filename, |
|
8
|
|
|
get_local_filename, |
|
9
|
|
|
getLogger, |
|
10
|
|
|
parse_json_string_with_comments, |
|
11
|
|
|
set_json_key_value_overrides, |
|
12
|
|
|
parse_json_string_or_file, |
|
13
|
|
|
redirect_stderr_and_stdout_to_file, |
|
14
|
|
|
) |
|
15
|
|
|
from ocrd_validators import WorkspaceValidator |
|
16
|
|
|
|
|
17
|
|
|
from ..resolver import Resolver |
|
18
|
|
|
from ..processor.base import ResourceNotFoundError, run_processor |
|
19
|
|
|
|
|
20
|
|
|
from .loglevel_option import ocrd_loglevel |
|
21
|
|
|
from .parameter_option import parameter_option, parameter_override_option |
|
22
|
|
|
from .ocrd_cli_options import ocrd_cli_options |
|
23
|
|
|
from .mets_find_options import mets_find_options |
|
24
|
|
|
|
|
25
|
|
|
|
|
26
|
|
|
def ocrd_cli_wrap_processor( |
|
27
|
|
|
processorClass, |
|
28
|
|
|
mets=None, |
|
29
|
|
|
mets_server_url=None, |
|
30
|
|
|
working_dir=None, |
|
31
|
|
|
dump_json=False, |
|
32
|
|
|
dump_module_dir=False, |
|
33
|
|
|
help=False, # pylint: disable=redefined-builtin |
|
34
|
|
|
profile=False, |
|
35
|
|
|
profile_file=None, |
|
36
|
|
|
version=False, |
|
37
|
|
|
overwrite=False, |
|
38
|
|
|
debug=False, |
|
39
|
|
|
resolve_resource=None, |
|
40
|
|
|
show_resource=None, |
|
41
|
|
|
list_resources=False, |
|
42
|
|
|
# ocrd_network params start # |
|
43
|
|
|
subcommand=None, |
|
44
|
|
|
queue=None, |
|
45
|
|
|
log_filename=None, |
|
46
|
|
|
database=None, |
|
47
|
|
|
# ocrd_network params end # |
|
48
|
|
|
**kwargs |
|
49
|
|
|
): |
|
50
|
|
|
# init logging handlers so no imported libs can preempt ours |
|
51
|
|
|
initLogging() |
|
52
|
|
|
|
|
53
|
|
|
# FIXME: remove workspace arg entirely |
|
54
|
|
|
processor = processorClass(None) |
|
55
|
|
|
if not sys.argv[1:]: |
|
56
|
|
|
processor.show_help(subcommand=subcommand) |
|
57
|
|
|
sys.exit(1) |
|
58
|
|
|
if help: |
|
59
|
|
|
processor.show_help(subcommand=subcommand) |
|
60
|
|
|
sys.exit() |
|
61
|
|
|
if version: |
|
62
|
|
|
processor.show_version() |
|
63
|
|
|
sys.exit() |
|
64
|
|
|
if dump_json: |
|
65
|
|
|
processor.dump_json() |
|
66
|
|
|
sys.exit() |
|
67
|
|
|
if dump_module_dir: |
|
68
|
|
|
processor.dump_module_dir() |
|
69
|
|
|
sys.exit() |
|
70
|
|
|
if resolve_resource: |
|
71
|
|
|
try: |
|
72
|
|
|
res = processor.resolve_resource(resolve_resource) |
|
73
|
|
|
print(res) |
|
74
|
|
|
sys.exit() |
|
75
|
|
|
except ResourceNotFoundError as e: |
|
76
|
|
|
log = getLogger('ocrd.processor.base') |
|
77
|
|
|
log.critical(e.message) |
|
78
|
|
|
sys.exit(1) |
|
79
|
|
|
if show_resource: |
|
80
|
|
|
try: |
|
81
|
|
|
processor.show_resource(show_resource) |
|
82
|
|
|
sys.exit() |
|
83
|
|
|
except ResourceNotFoundError as e: |
|
84
|
|
|
log = getLogger('ocrd.processor.base') |
|
85
|
|
|
log.critical(e.message) |
|
86
|
|
|
sys.exit(1) |
|
87
|
|
|
if list_resources: |
|
88
|
|
|
processor.list_resources() |
|
89
|
|
|
sys.exit() |
|
90
|
|
|
if subcommand == "worker" or queue or database: |
|
91
|
|
|
check_and_run_processing_worker(processorClass, database, queue) |
|
92
|
|
|
|
|
93
|
|
|
if 'parameter' in kwargs: |
|
94
|
|
|
# Disambiguate parameter file/literal, and resolve file |
|
95
|
|
|
def resolve(name): |
|
96
|
|
|
try: |
|
97
|
|
|
return processor.resolve_resource(name) |
|
98
|
|
|
except ResourceNotFoundError: |
|
99
|
|
|
return None |
|
100
|
|
|
kwargs['parameter'] = parse_json_string_or_file(*kwargs['parameter'], |
|
101
|
|
|
resolve_preset_file=resolve) |
|
102
|
|
|
else: |
|
103
|
|
|
kwargs['parameter'] = {} |
|
104
|
|
|
# Merge parameter overrides and parameters |
|
105
|
|
|
if 'parameter_override' in kwargs: |
|
106
|
|
|
set_json_key_value_overrides(kwargs['parameter'], *kwargs.pop('parameter_override')) |
|
107
|
|
|
# Assert -I / -O |
|
108
|
|
|
if not kwargs.get('input_file_grp', None): |
|
109
|
|
|
raise ValueError('-I/--input-file-grp is required') |
|
110
|
|
|
if 'output_file_grp' not in kwargs: |
|
111
|
|
|
raise ValueError('-O/--output-file-grp is required') # actually, it may be None |
|
112
|
|
|
resolver = Resolver() |
|
113
|
|
|
working_dir, mets, _, mets_server_url = \ |
|
114
|
|
|
resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url) |
|
115
|
|
|
workspace = resolver.workspace_from_url(mets, working_dir, mets_server_url=mets_server_url) |
|
116
|
|
|
page_id = kwargs.get('page_id') |
|
117
|
|
|
if debug: |
|
118
|
|
|
config.OCRD_MISSING_INPUT = 'ABORT' |
|
119
|
|
|
config.OCRD_MISSING_OUTPUT = 'ABORT' |
|
120
|
|
|
config.OCRD_EXISTING_OUTPUT = 'ABORT' |
|
121
|
|
|
if overwrite: |
|
122
|
|
|
config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' |
|
123
|
|
|
report = WorkspaceValidator.check_file_grp(workspace, |
|
124
|
|
|
kwargs['input_file_grp'], |
|
125
|
|
|
'' if overwrite else kwargs['output_file_grp'], |
|
126
|
|
|
page_id) |
|
127
|
|
|
if not report.is_valid: |
|
128
|
|
|
raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) |
|
129
|
|
|
# Set up profiling behavior from environment variables/flags |
|
130
|
|
|
if not profile and 'CPU' in config.OCRD_PROFILE: |
|
131
|
|
|
profile = True |
|
132
|
|
|
if not profile_file and config.is_set('OCRD_PROFILE_FILE'): |
|
133
|
|
|
profile_file = config.OCRD_PROFILE_FILE |
|
134
|
|
|
if profile or profile_file: |
|
135
|
|
|
import cProfile |
|
136
|
|
|
import pstats |
|
137
|
|
|
import io |
|
138
|
|
|
import atexit |
|
139
|
|
|
print("Profiling...") |
|
140
|
|
|
pr = cProfile.Profile() |
|
141
|
|
|
pr.enable() |
|
142
|
|
|
|
|
143
|
|
|
def goexit(): |
|
144
|
|
|
pr.disable() |
|
|
|
|
|
|
145
|
|
|
print("Profiling completed") |
|
146
|
|
|
if profile_file: |
|
147
|
|
|
pr.dump_stats(profile_file) |
|
148
|
|
|
s = io.StringIO() |
|
|
|
|
|
|
149
|
|
|
pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() |
|
|
|
|
|
|
150
|
|
|
print(s.getvalue()) |
|
151
|
|
|
|
|
152
|
|
|
atexit.register(goexit) |
|
153
|
|
|
if log_filename: |
|
154
|
|
|
log_ctx = redirect_stderr_and_stdout_to_file(log_filename) |
|
155
|
|
|
else: |
|
156
|
|
|
log_ctx = nullcontext() |
|
157
|
|
|
with log_ctx: |
|
158
|
|
|
run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) |
|
159
|
|
|
|
|
160
|
|
|
|
|
161
|
|
|
def check_and_run_processing_worker(ProcessorClass, database: str, queue: str): |
|
162
|
|
|
""" Check/start Processing Worker for the WebAPI architecture |
|
163
|
|
|
""" |
|
164
|
|
|
from ocrd_network import ProcessingWorker |
|
165
|
|
|
|
|
166
|
|
|
if not database: |
|
167
|
|
|
raise ValueError("Option '--database' is required for the Processing Worker") |
|
168
|
|
|
if not queue: |
|
169
|
|
|
raise ValueError("Option '--queue' is required for the Processing Worker") |
|
170
|
|
|
|
|
171
|
|
|
processor = ProcessorClass(workspace=None) |
|
172
|
|
|
processing_worker = ProcessingWorker( |
|
173
|
|
|
rabbitmq_addr=queue, |
|
174
|
|
|
mongodb_addr=database, |
|
175
|
|
|
processor_name=processor.ocrd_tool['executable'], |
|
176
|
|
|
ocrd_tool=processor.ocrd_tool, |
|
177
|
|
|
processor_class=ProcessorClass, |
|
178
|
|
|
) |
|
179
|
|
|
# The RMQConsumer is initialized and a connection to the RabbitMQ is performed |
|
180
|
|
|
processing_worker.connect_consumer() |
|
181
|
|
|
# Start consuming from the queue with name `processor_name` |
|
182
|
|
|
processing_worker.start_consuming() |
|
183
|
|
|
sys.exit(0) |
|
184
|
|
|
|