Passed
Push — master ( 71c0c1...5d69e4 )
by Konstantin
02:52
created

ocrd_utils.config.OcrdEnvVariable.describe()   B

Complexity

Conditions 6

Size

Total Lines 22
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 15
dl 0
loc 22
rs 8.6666
c 0
b 0
f 0
cc 6
nop 3
1
"""
2
Most behavior of OCR-D is controlled via command-line flags or keyword args.
3
Some behavior is global or too cumbersome to handle via explicit code and
4
better solved by using environment variables.
5
6
OcrdEnvConfig is a base class to make this more streamlined, to be subclassed
7
in the `ocrd` package for the actual values
8
"""
9
10
from os import environ
11
from pathlib import Path
12
from tempfile import gettempdir
13
from textwrap import fill, indent
14
15
16
def _validator_boolean(val):
17
    return isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1')
18
19
def _parser_boolean(val):
20
    return bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')
21
22
class OcrdEnvVariable():
    """
    Declaration of a single environment variable used in OCR-D.

    Bundles the variable's name and description with the callables used to
    validate and parse its raw value, plus an optional default.
    """

    def __init__(self, name, description, parser=str, validator=lambda _: True, default=(False, None)):
        """
        An environment variable for use in OCR-D.

        Args:
            name (str): Name of the environment variable
            description (str): Description of what the variable is used for.

        Keyword Args:
            parser (callable): Function to transform the raw (string) value to whatever is needed.
            validator (callable): Function to validate that the raw (string) value is parseable.
            default (tuple(bool, any)): 2-tuple, first element is a bool whether there is a default
                value defined and second element contains that default value, which can be a callable
                for deferred evaluation
        """
        self.name = name
        self.description = description
        self.parser = parser
        self.validator = validator
        self.has_default = default[0]
        self.default = default[1]

    def __str__(self):
        return f'{self.name}: {self.description}'

    def describe(self, wrap_text=True, indent_text=True):
        r"""
        Output help information on a config option.

        If ``option.description`` is a multiline string with complex formatting
        (e.g. markdown lists), replace empty lines with ``\b`` and set
        ``wrap_text`` to ``False``.

        Args:
            wrap_text (bool): Re-flow the description to a width of 50 columns.
            indent_text (bool): Indent the name by 2 and the description by 4 spaces.

        Returns:
            str: The variable name on the first line, followed by the
                description (including the default value, if one is defined).
        """
        desc = self.description
        if self.has_default:
            # callable defaults are evaluated lazily, i.e. only when needed
            default = self.default() if callable(self.default) else self.default
            if not desc.endswith('\n'):
                desc += ' '
            desc += f'(Default: "{default}")'
        ret = f'{self.name}\n'
        if wrap_text:
            desc = fill(desc, width=50)
        if indent_text:
            ret = f'  {ret}'
            desc = indent(desc, '    ')
        return ret + desc
71
72
class OcrdEnvConfig():
    """
    Registry of environment variable declarations with attribute-style access.

    Reading ``config.NAME`` for a registered variable takes the raw value from
    the process environment, falls back to the declared default, then validates
    and parses it. Assigning ``config.NAME = value`` stores a plain instance
    attribute which shadows that lookup until ``reset_defaults()`` is called.
    """

    def __init__(self):
        self._variables = {}

    def _registered(self, name):
        """Return the variable registered as ``name`` or raise ``ValueError``."""
        try:
            return self._variables[name]
        except KeyError:
            raise ValueError(f"Unregistered env variable {name}") from None

    def add(self, name, *args, **kwargs):
        """
        Register a new variable and return its ``OcrdEnvVariable``.

        All arguments are passed on to the ``OcrdEnvVariable`` constructor.
        """
        var = OcrdEnvVariable(name, *args, **kwargs)
        # make visible in ocrd_utils.config docstring (apidoc)
        txt = var.describe(wrap_text=False, indent_text=True)
        module_doc = globals().get('__doc__')
        # the module docstring is None when docstrings are stripped (python -OO)
        if module_doc is not None:
            globals()['__doc__'] = module_doc + "\n\n - " + txt + "\n\n"
        self._variables[name] = var
        return self._variables[name]

    def has_default(self, name):
        """
        Tell whether the registered variable ``name`` declares a default.

        Raises:
            ValueError: if ``name`` has not been registered.
        """
        return self._registered(name).has_default

    def reset_defaults(self):
        """Remove all values that were set directly as attributes on this object."""
        overrides = vars(self)
        for name in self._variables:
            # look at the instance dict directly: hasattr/getattr would fall
            # through to __getattr__, which provides defaults (which of course
            # cannot be removed); a truthiness test would miss falsy overrides
            if name in overrides:
                delattr(self, name)

    def describe(self, name, *args, **kwargs):
        """
        Return the help text of the registered variable ``name``.

        Extra arguments are passed on to ``OcrdEnvVariable.describe``.

        Raises:
            ValueError: if ``name`` has not been registered.
        """
        return self._registered(name).describe(*args, **kwargs)

    def __getattr__(self, name):
        """
        Resolve the parsed value of the registered variable ``name``.

        Raises:
            AttributeError: if ``name`` has not been registered.
            KeyError: if the variable is unset and has no default.
            ValueError: if the validator rejects the raw value.
        """
        # will be called if name is not accessible (has not been added directly yet)
        try:
            # bypass __getattr__ to avoid infinite recursion on instances that
            # have no registry yet (e.g. while being copied or unpickled)
            variables = object.__getattribute__(self, '_variables')
        except AttributeError:
            variables = {}
        if name not in variables:
            raise AttributeError(f"Unregistered env variable {name}")
        var_obj = variables[name]
        try:
            raw_value = self.raw_value(name)
        except KeyError:
            if not var_obj.has_default:
                raise
            raw_value = var_obj.default() if callable(var_obj.default) else var_obj.default
        if not var_obj.validator(raw_value):
            raise ValueError(f"'{name}' set to invalid value '{raw_value}'")
        return var_obj.parser(raw_value)

    def is_set(self, name):
        """
        Tell whether the registered variable ``name`` is present in the environment.

        Raises:
            ValueError: if ``name`` has not been registered.
        """
        self._registered(name)
        return name in environ

    def raw_value(self, name):
        """
        Return the unparsed environment value of the registered variable ``name``.

        Raises:
            ValueError: if ``name`` has not been registered.
            KeyError: if the variable is not set in the environment.
        """
        self._registered(name)
        return environ[name]
131
132
# Module-wide registry; every variable known to OCR-D is declared below.
config = OcrdEnvConfig()

config.add(
    'OCRD_METS_CACHING',
    description='If set to `true`, access to the METS file is cached, speeding in-memory search and modification.',
    parser=_parser_boolean,
    validator=_validator_boolean,
)

config.add(
    'OCRD_MAX_PROCESSOR_CACHE',
    description="Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.",
    default=(True, 128),
    parser=int,
)

config.add(
    'OCRD_MAX_PARALLEL_PAGES',
    description="Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set >1, then a METS Server must be used for METS synchronisation.",
    default=(True, 1),
    parser=int,
)

config.add(
    'OCRD_PROCESSING_PAGE_TIMEOUT',
    description="Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.",
    default=(True, 0),
    parser=int,
)

# The description uses ``\b`` markers in place of empty lines around the list.
config.add(
    "OCRD_PROFILE",
    description="""\
Whether to enable gathering runtime statistics
on the `ocrd.profile` logger (comma-separated):
\b
- `CPU`: yields CPU and wall-time,
- `RSS`: also yields peak memory (resident set size)
- `PSS`: also yields peak memory (proportional set size)
\b
""",
    default=(True, ''),
    validator=lambda spec: all(token in ('', 'CPU', 'RSS', 'PSS') for token in spec.split(',')),
)

config.add(
    "OCRD_PROFILE_FILE",
    description="If set, then the CPU profile is written to this file for later peruse with a analysis tools like snakeviz",
)
169
170
def _validator_integer(val):
    """
    Check whether ``val`` can be converted by ``int``.

    Returns a real boolean, so that values parsing to ``0`` are not mistaken
    for a failed validation (the result of a validator is tested for truth).
    """
    try:
        int(val)
    except (TypeError, ValueError):
        return False
    return True


config.add("OCRD_DOWNLOAD_RETRIES",
    description="Number of times to retry failed attempts for downloads of resources or workspace files.",
    validator=_validator_integer,
    parser=int)
174
175
def _ocrd_download_timeout_parser(val):
176
    timeout = val.split(',')
177
    if len(timeout) > 1:
178
        timeout = tuple(float(x) for x in timeout)
179
    else:
180
        timeout = float(timeout[0])
181
    return timeout
182
183
config.add(
    "OCRD_DOWNLOAD_TIMEOUT",
    description="Timeout in seconds for connecting or reading (comma-separated) when downloading.",
    parser=_ocrd_download_timeout_parser,
)

config.add(
    "OCRD_DOWNLOAD_INPUT",
    description="Whether to download files not present locally during processing",
    parser=_parser_boolean,
    validator=_validator_boolean,
    default=(True, True),
)

# The three policies below share the same layout: a ``\b``-delimited list of
# allowed keywords in the description and a validator restricting the value
# to exactly those keywords.
config.add(
    "OCRD_MISSING_INPUT",
    description="""\
How to deal with missing input files
(for some fileGrp/pageId) during processing:
\b
 - `SKIP`: ignore and proceed with next page's input
 - `ABORT`: throw :py:class:`.MissingInputFile`
\b
""",
    parser=str,
    validator=lambda policy: policy in ('SKIP', 'ABORT'),
    default=(True, 'SKIP'),
)

config.add(
    "OCRD_MISSING_OUTPUT",
    description="""\
How to deal with missing output files
(for some fileGrp/pageId) during processing:
\b
 - `SKIP`: ignore and proceed processing next page
 - `COPY`: fall back to copying input PAGE to output fileGrp for page
 - `ABORT`: re-throw whatever caused processing to fail
\b
""",
    parser=str,
    validator=lambda policy: policy in ('SKIP', 'COPY', 'ABORT'),
    default=(True, 'SKIP'),
)

config.add(
    "OCRD_MAX_MISSING_OUTPUTS",
    description="Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).",
    parser=float,
    default=(True, 0.1),
)

config.add(
    "OCRD_EXISTING_OUTPUT",
    description="""\
How to deal with already existing output files
(for some fileGrp/pageId) during processing:
\b
 - `SKIP`: ignore and proceed processing next page
 - `OVERWRITE`: force writing result to output fileGrp for page
 - `ABORT`: re-throw :py:class:`FileExistsError`
\b
""",
    parser=str,
    validator=lambda policy: policy in ('SKIP', 'OVERWRITE', 'ABORT'),
    default=(True, 'SKIP'),
)
238
239
# Settings of the ocrd_network clients and servers.
config.add(
    "OCRD_NETWORK_SERVER_ADDR_PROCESSING",
    description="Default address of Processing Server to connect to (for `ocrd network client processing`).",
    default=(True, ''),
)

config.add(
    "OCRD_NETWORK_CLIENT_POLLING_SLEEP",
    description="How many seconds to sleep before trying again.",
    default=(True, 10),
    parser=int,
)

config.add(
    "OCRD_NETWORK_CLIENT_POLLING_TIMEOUT",
    description="Timeout for a blocking ocrd network client (in seconds).",
    default=(True, 3600),
    parser=int,
)

config.add(
    "OCRD_NETWORK_SERVER_ADDR_WORKFLOW",
    description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).",
    default=(True, ''),
)

config.add(
    "OCRD_NETWORK_SERVER_ADDR_WORKSPACE",
    description="Default address of Workspace Server to connect to (for `ocrd network client workspace`).",
    default=(True, ''),
)

config.add(
    "OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS",
    description="Number of attempts for a RabbitMQ client to connect before failing.",
    default=(True, 3),
    parser=int,
)

config.add(
    "OCRD_NETWORK_RABBITMQ_HEARTBEAT",
    # spelled out piecewise so that the leading newline, the indentation and
    # the trailing blank of the first text line stay visible
    description=(
        "\n"
        "    Controls AMQP heartbeat timeout (in seconds) negotiation during connection tuning. An integer value always overrides the value \n"
        "    proposed by broker. Use 0 to deactivate heartbeat.\n"
        "    "
    ),
    default=(True, 0),
    parser=int,
)
275
276
config.add(
    "OCRD_NETWORK_SOCKETS_ROOT_DIR",
    description="The root directory where all mets server related socket files are created",
    default=(True, Path(gettempdir()) / "ocrd_network_sockets"),
    parser=Path,
)
# the directory is created right away, i.e. when this module is imported
config.OCRD_NETWORK_SOCKETS_ROOT_DIR.mkdir(parents=True, exist_ok=True)

config.add(
    "OCRD_NETWORK_LOGS_ROOT_DIR",
    description="The root directory where all ocrd_network related file logs are stored",
    default=(True, Path(gettempdir()) / "ocrd_network_logs"),
    parser=Path,
)
# the directory is created right away, i.e. when this module is imported
config.OCRD_NETWORK_LOGS_ROOT_DIR.mkdir(parents=True, exist_ok=True)

# HOME directory, cf. https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html
config.add(
    "HOME",
    description="Directory to look for `ocrd_logging.conf`, fallback for unset XDG variables.",
    # kept as a lambda so the home directory is looked up on each evaluation
    default=(True, lambda: Path.home()),
    validator=lambda candidate: Path(candidate).is_dir(),
    parser=Path,
)

# Both XDG defaults are derived from ``config.HOME`` at evaluation time.
config.add(
    "XDG_DATA_HOME",
    description="Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location)",
    default=(True, lambda: Path(config.HOME, '.local/share')),
    parser=Path,
)

config.add(
    "XDG_CONFIG_HOME",
    description="Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database)",
    default=(True, lambda: Path(config.HOME, '.config')),
    parser=Path,
)

config.add(
    "OCRD_LOGGING_DEBUG",
    description="Print information about the logging setup to STDERR",
    parser=_parser_boolean,
    validator=_validator_boolean,
    default=(True, False),
)
310