Passed
Pull Request — master (#1240)
by Konstantin
03:00
created

ocrd_utils.config._parser_boolean()   A

Complexity

Conditions 2

Size

Total Lines 2
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 2
rs 10
c 0
b 0
f 0
cc 2
nop 1
1
"""
2
Most behavior of OCR-D is controlled via command-line flags or keyword args.
3
Some behavior is global or too cumbersome to handle via explicit code and
4
better solved by using environment variables.
5
6
OcrdEnvConfig is a base class to make this more streamlined, to be subclassed
7
in the `ocrd` package for the actual values
8
"""
9
10
from os import environ
11
from pathlib import Path
12
from tempfile import gettempdir
13
from textwrap import fill, indent
14
15
16
def _validator_boolean(val):
17
    return isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1')
18
19
def _parser_boolean(val):
20
    return bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')
21
22
class OcrdEnvVariable():
    """
    Definition of a single environment variable: its name, help text,
    parser, validator and optional default value.
    """

    def __init__(self, name, description, parser=str, validator=lambda val: True, default=(False, None)):
        """
        An environment variable for use in OCR-D.

        Args:
            name (str): Name of the environment variable
            description (str): Description of what the variable is used for.

        Keyword Args:
            parser (callable): Function to transform the raw (string) value to whatever is needed.
            validator (callable): Function to validate that the raw (string) value is parseable.
            default (tuple(bool, any)): 2-tuple, first element is a bool whether there is a default
                value defined and second element contains that default value, which can be a callable
                for deferred evaluation
        """
        self.name = name
        self.description = description
        self.parser = parser
        self.validator = validator
        self.has_default = default[0]
        self.default = default[1]

    def __str__(self):
        """Return ``<name>: <description>``."""
        return f'{self.name}: {self.description}'

    def describe(self, wrap_text=True, indent_text=True):
        """
        Render a help entry: the variable name on the first line, followed
        by the description (with the default value appended, if there is one).

        Keyword Args:
            wrap_text (bool): Wrap the description at a width of 50 characters.
            indent_text (bool): Indent the name by 2 and the description by 4 spaces.

        Returns:
            str: the rendered entry
        """
        desc = self.description
        if self.has_default:
            # callable defaults are evaluated lazily, i.e. only when needed
            default = self.default() if callable(self.default) else self.default
            desc += f' (Default: "{default}")'
        ret = f'{self.name}\n'
        if wrap_text:
            desc = fill(desc, width=50)
        if indent_text:
            ret = f'  {ret}'
            desc = indent(desc, '    ')
        return ret + desc
62
63
class OcrdEnvConfig():
    """
    Registry of :py:class:`OcrdEnvVariable` definitions.

    Registered variables are read as attributes (``config.SOME_NAME``): the
    raw value is taken from the environment, falling back to the variable's
    default if there is one, then validated and parsed.
    """

    def __init__(self):
        # maps variable name -> OcrdEnvVariable
        self._variables = {}

    def _registered(self, name):
        """
        Return the variable registered under ``name``.

        Raises:
            ValueError: if no variable of that name has been registered
        """
        if name not in self._variables:
            raise ValueError(f"Unregistered env variable {name}")
        return self._variables[name]

    def add(self, name, *args, **kwargs):
        """
        Register a new variable and append its description to the module docstring.

        All arguments are passed on to :py:class:`OcrdEnvVariable`.

        Returns:
            OcrdEnvVariable: the newly registered variable
        """
        var = OcrdEnvVariable(name, *args, **kwargs)
        # make visible in ocrd_utils.config docstring (apidoc)
        txt = var.describe(wrap_text=False, indent_text=True)
        module_ns = globals()
        # __doc__ is None if docstrings have been stripped (python -OO),
        # so fall back to an empty string instead of failing on None + str
        module_ns['__doc__'] = (module_ns.get('__doc__') or '') + "\n\n - " + txt + "\n\n"
        self._variables[name] = var
        return var

    def has_default(self, name):
        """
        Tell whether the variable ``name`` has a default value.

        Raises:
            ValueError: if ``name`` is not registered
        """
        return self._registered(name).has_default

    def describe(self, name, *args, **kwargs):
        """
        Return the help entry of the variable ``name``; further arguments
        are passed on to :py:meth:`OcrdEnvVariable.describe`.

        Raises:
            ValueError: if ``name`` is not registered
        """
        return self._registered(name).describe(*args, **kwargs)

    def __getattr__(self, name):
        """
        Return the validated and parsed value of the variable ``name``.

        Raises:
            AttributeError: if ``name`` is unregistered and starts with an underscore
            ValueError: if ``name`` is otherwise unregistered, or its raw value is invalid
            KeyError: if the variable is not set in the environment and has no default
        """
        # Read the registry via __dict__: regular attribute access would
        # re-enter __getattr__ endlessly while `_variables` does not exist
        # yet (e.g. on instances reconstructed by copy or pickle).
        variables = self.__dict__.get('_variables', {})
        if name not in variables:
            if name.startswith('_'):
                # private/dunder probes (hasattr, copy, pickle) expect AttributeError
                raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
            raise ValueError(f"Unregistered env variable {name}")
        var_obj = variables[name]
        try:
            raw_value = self.raw_value(name)
        except KeyError:
            if not var_obj.has_default:
                raise
            # callable defaults are evaluated lazily, on every access
            raw_value = var_obj.default() if callable(var_obj.default) else var_obj.default
        if not var_obj.validator(raw_value):
            raise ValueError(f"'{name}' set to invalid value '{raw_value}'")
        return var_obj.parser(raw_value)

    def is_set(self, name):
        """
        Tell whether the variable ``name`` is present in the environment.

        Raises:
            ValueError: if ``name`` is not registered
        """
        self._registered(name)
        return name in environ

    def raw_value(self, name):
        """
        Return the unparsed value of the variable ``name`` from the environment.

        Raises:
            ValueError: if ``name`` is not registered
            KeyError: if ``name`` is not set in the environment
        """
        self._registered(name)
        return environ[name]
110
111
# Module-level registry; the variables known to OCR-D are registered on it below.
config = OcrdEnvConfig()

# -- METS handling and processing limits --

config.add(
    'OCRD_METS_CACHING',
    description='If set to `true`, access to the METS file is cached, speeding in-memory search and modification.',
    parser=_parser_boolean,
    validator=_validator_boolean)

config.add(
    'OCRD_MAX_PROCESSOR_CACHE',
    description="Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.",
    default=(True, 128),
    parser=int)

config.add(
    'OCRD_MAX_PARALLEL_PAGES',
    description="Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set >1, then a METS Server must be used for METS synchronisation.",
    default=(True, 1),
    parser=int)

config.add(
    'OCRD_PROCESSING_PAGE_TIMEOUT',
    description="Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.",
    default=(True, 0),
    parser=int)

# -- profiling --

config.add(
    "OCRD_PROFILE",
    description="""\
Whether to enable gathering runtime statistics
on the `ocrd.profile` logger (comma-separated):

- `CPU`: yields CPU and wall-time,
- `RSS`: also yields peak memory (resident set size)
- `PSS`: also yields peak memory (proportional set size)

""",
    default=(True, ''),
    # every comma-separated token must be one of the known names (or empty)
    validator=lambda val: set(val.split(',')).issubset(('', 'CPU', 'RSS', 'PSS')))

config.add(
    "OCRD_PROFILE_FILE",
    description="If set, then the CPU profile is written to this file for later peruse with a analysis tools like snakeviz")
148
149
def _validator_integer(val):
    """
    Check whether ``val`` can be converted with ``int()``.

    Returns ``True`` for every convertible value, including zero, and
    ``False`` otherwise.
    """
    try:
        int(val)
    except (TypeError, ValueError):
        return False
    return True

# NOTE: a validator must return a truth value for every acceptable input.
# Plain `int` yields 0 (falsy) for "0", which made zero retries count as an
# invalid setting.
config.add("OCRD_DOWNLOAD_RETRIES",
    description="Number of times to retry failed attempts for downloads of resources or workspace files.",
    validator=_validator_integer,
    parser=int)
153
154
def _ocrd_download_timeout_parser(val):
155
    timeout = val.split(',')
156
    if len(timeout) > 1:
157
        timeout = tuple(float(x) for x in timeout)
158
    else:
159
        timeout = float(timeout[0])
160
    return timeout
161
162
# -- downloading --

config.add(
    "OCRD_DOWNLOAD_TIMEOUT",
    description="Timeout in seconds for connecting or reading (comma-separated) when downloading.",
    parser=_ocrd_download_timeout_parser)

config.add(
    "OCRD_DOWNLOAD_INPUT",
    description="Whether to download files not present locally during processing",
    validator=_validator_boolean,
    parser=_parser_boolean,
    default=(True, True))

# -- policies for missing input, missing output and existing output --

config.add(
    "OCRD_MISSING_INPUT",
    description="""\
How to deal with missing input files (for some fileGrp/pageId) during processing:

 - `SKIP`: ignore and proceed with next page's input
 - `ABORT`: throw :py:class:`.MissingInputFile`

""",
    validator=lambda val: val in ('SKIP', 'ABORT'),
    parser=str,
    default=(True, 'SKIP'))

config.add(
    "OCRD_MISSING_OUTPUT",
    description="""\
How to deal with missing output files (for some fileGrp/pageId) during processing:

 - `SKIP`: ignore and proceed processing next page
 - `COPY`: fall back to copying input PAGE to output fileGrp for page
 - `ABORT`: re-throw whatever caused processing to fail

""",
    validator=lambda val: val in ('SKIP', 'COPY', 'ABORT'),
    parser=str,
    default=(True, 'SKIP'))

config.add(
    "OCRD_MAX_MISSING_OUTPUTS",
    description="Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).",
    parser=float,
    default=(True, 0.1))

config.add(
    "OCRD_EXISTING_OUTPUT",
    description="""\
How to deal with already existing output files (for some fileGrp/pageId) during processing:

 - `SKIP`: ignore and proceed processing next page
 - `OVERWRITE`: force writing result to output fileGrp for page
 - `ABORT`: re-throw :py:class:`FileExistsError`

""",
    validator=lambda val: val in ('SKIP', 'OVERWRITE', 'ABORT'),
    parser=str,
    default=(True, 'SKIP'))
214
215
# -- ocrd_network: server addresses and client polling --

config.add(
    "OCRD_NETWORK_SERVER_ADDR_PROCESSING",
    description="Default address of Processing Server to connect to (for `ocrd network client processing`).",
    default=(True, ''))

config.add(
    "OCRD_NETWORK_CLIENT_POLLING_SLEEP",
    description="How many seconds to sleep before trying again.",
    default=(True, 30),
    parser=int)

config.add(
    "OCRD_NETWORK_CLIENT_POLLING_TIMEOUT",
    description="Timeout for a blocking ocrd network client (in seconds).",
    default=(True, 3600),
    parser=int)

config.add(
    "OCRD_NETWORK_SERVER_ADDR_WORKFLOW",
    description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).",
    default=(True, ''))

config.add(
    "OCRD_NETWORK_SERVER_ADDR_WORKSPACE",
    description="Default address of Workspace Server to connect to (for `ocrd network client workspace`).",
    default=(True, ''))

config.add(
    "OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS",
    description="Number of attempts for a RabbitMQ client to connect before failing.",
    default=(True, 3),
    parser=int)

# -- ocrd_network: runtime directories (created right away, on import) --

config.add(
    "OCRD_NETWORK_SOCKETS_ROOT_DIR",
    description="The root directory where all mets server related socket files are created",
    default=(True, Path(gettempdir(), "ocrd_network_sockets")),
    parser=Path)
config.OCRD_NETWORK_SOCKETS_ROOT_DIR.mkdir(parents=True, exist_ok=True)

config.add(
    "OCRD_NETWORK_LOGS_ROOT_DIR",
    description="The root directory where all ocrd_network related file logs are stored",
    default=(True, Path(gettempdir(), "ocrd_network_logs")),
    parser=Path)
config.OCRD_NETWORK_LOGS_ROOT_DIR.mkdir(parents=True, exist_ok=True)
253
254
# -- home and XDG base directories --
# cf. https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html

config.add(
    "HOME",
    description="Directory to look for `ocrd_logging.conf`, fallback for unset XDG variables.",
    default=(True, Path.home),
    validator=lambda val: Path(val).is_dir(),
    parser=Path)

# The XDG defaults are callables, so HOME is only looked up when they are needed.
config.add(
    "XDG_DATA_HOME",
    description="Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location)",
    default=(True, lambda: Path(config.HOME, '.local/share')),
    parser=Path)

config.add(
    "XDG_CONFIG_HOME",
    description="Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database)",
    default=(True, lambda: Path(config.HOME, '.config')),
    parser=Path)

# -- logging --

config.add(
    "OCRD_LOGGING_DEBUG",
    description="Print information about the logging setup to STDERR",
    validator=_validator_boolean,
    parser=_parser_boolean,
    default=(True, False))
276