Passed
Pull Request — master (#1240)
by Konstantin
03:20
created

ocrd_utils.config._parser_boolean()   A

Complexity

Conditions 2

Size

Total Lines 2
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 2
rs 10
c 0
b 0
f 0
cc 2
nop 1
1
"""
2
Most behavior of OCR-D is controlled via command-line flags or keyword args.
3
Some behavior is global or too cumbersome to handle via explicit code and
4
better solved by using environment variables.
5
6
OcrdEnvConfig is a base class to make this more streamlined, to be subclassed
7
in the `ocrd` package for the actual values
8
"""
9
10
from os import environ
11
from pathlib import Path
12
from tempfile import gettempdir
13
from textwrap import fill, indent
14
15
16
def _validator_boolean(val):
17
    return isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1')
18
19
def _parser_boolean(val):
20
    return bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')
21
22
class OcrdEnvVariable():
    """
    Description of a single environment variable: its name, a human-readable
    description, how to validate and parse its raw value, and an optional
    default.
    """

    def __init__(self, name, description, parser=str, validator=lambda val: True, default=(False, None)):
        """
        An environment variable for use in OCR-D.

        Args:
            name (str): Name of the environment variable
            description (str): Description of what the variable is used for.

        Keyword Args:
            parser (callable): Function to transform the raw (string) value to whatever is needed.
            validator (callable): Function to validate that the raw (string) value is parseable.
            default (tuple(bool, any)): 2-tuple, first element is a bool whether there is a default
                value defined and second element contains that default value, which can be a callable
                for deferred evaluation
        """
        self.name = name
        self.description = description
        self.parser = parser
        self.validator = validator
        # only indexed, never mutated, so any 2-element sequence works here
        self.has_default = default[0]
        self.default = default[1]

    def __str__(self):
        """Return ``name: description``."""
        return f'{self.name}: {self.description}'

    def describe(self, wrap_text=True, indent_text=True):
        """
        Return the variable name on one line followed by its description.

        If a default is defined it is appended to the description as
        ``(Default: "...")``; a callable default is called to obtain the
        value shown.

        Keyword Args:
            wrap_text (bool): Wrap the description to a width of 50 characters.
            indent_text (bool): Indent the name by 2 and the description by 4 spaces.
        """
        desc = self.description
        if self.has_default:
            default = self.default() if callable(self.default) else self.default
            desc += f' (Default: "{default}")'
        ret = f'{self.name}\n'
        if wrap_text:
            desc = fill(desc, width=50)
        if indent_text:
            ret = f'  {ret}'
            desc = indent(desc, '    ')
        return ret + desc
62
63
class OcrdEnvConfig():
    """
    Registry of environment variables whose parsed values are exposed as
    attributes of the instance.

    A value assigned directly on the instance takes precedence, because
    ``__getattr__`` is only consulted when normal attribute lookup fails.
    """

    def __init__(self):
        self._variables = {}

    def _registered(self, name):
        """Return the variable registered as ``name`` or raise ``ValueError``."""
        if name not in self._variables:
            raise ValueError(f"Unregistered env variable {name}")
        return self._variables[name]

    def add(self, name, *args, **kwargs):
        """
        Create an ``OcrdEnvVariable`` from the arguments, register it under
        ``name`` and return it.

        The variable's description is also appended to this module's docstring.
        """
        var = OcrdEnvVariable(name, *args, **kwargs)
        # make visible in ocrd_utils.config docstring (apidoc)
        txt = var.describe(wrap_text=False, indent_text=True)
        # the module docstring is None when docstrings are stripped (python -OO)
        module_doc = globals().get('__doc__') or ''
        globals()['__doc__'] = module_doc + "\n\n - " + txt + "\n\n"
        self._variables[name] = var
        return var

    def has_default(self, name):
        """
        Return whether the registered variable ``name`` defines a default.

        Raises:
            ValueError: if ``name`` has not been registered.
        """
        return self._registered(name).has_default

    def reset_defaults(self):
        """
        Drop every value that was assigned directly on the instance for a
        registered variable, so later access is resolved by ``__getattr__``
        again.
        """
        # Look at the instance dict directly: hasattr/getattr would delegate
        # to __getattr__ and report the environment/default value, and a
        # truthiness test would keep overrides such as 0, False or ''.
        overrides = vars(self)
        for name in self._variables:
            overrides.pop(name, None)

    def describe(self, name, *args, **kwargs):
        """
        Return the description of the registered variable ``name``; extra
        arguments are passed on to ``OcrdEnvVariable.describe``.

        Raises:
            ValueError: if ``name`` has not been registered.
        """
        return self._registered(name).describe(*args, **kwargs)

    def __getattr__(self, name):
        """
        Resolve a registered variable to its parsed value.

        The raw value is read from the environment; if it is not set there,
        the default is used (and called first if it is a callable). The raw
        value is checked with the variable's validator and then converted
        with its parser.

        Raises:
            AttributeError: if ``name`` has not been registered.
            KeyError: if the variable is not set and has no default.
            ValueError: if the validator rejects the raw value.
        """
        # will be called if name is not accessible (has not been added directly yet)
        # Read _variables from the instance dict so that a lookup on an
        # instance without it (e.g. created via __new__) cannot recurse.
        variables = self.__dict__.get('_variables', {})
        if name not in variables:
            raise AttributeError(f"Unregistered env variable {name}")
        var_obj = variables[name]
        try:
            raw_value = self.raw_value(name)
        except KeyError:
            if not var_obj.has_default:
                raise
            raw_value = var_obj.default() if callable(var_obj.default) else var_obj.default
        if not var_obj.validator(raw_value):
            raise ValueError(f"'{name}' set to invalid value '{raw_value}'")
        return var_obj.parser(raw_value)

    def is_set(self, name):
        """
        Return whether the registered variable ``name`` is present in the
        environment.

        Raises:
            ValueError: if ``name`` has not been registered.
        """
        self._registered(name)
        return name in environ

    def raw_value(self, name):
        """
        Return the unparsed value of the registered variable ``name`` from
        the environment.

        Raises:
            ValueError: if ``name`` has not been registered.
            KeyError: if the variable is not present in the environment.
        """
        self._registered(name)
        return environ[name]
122
123
# Module-level registry; every variable known to OCR-D is registered below.
config = OcrdEnvConfig()

# Boolean switch without a default.
config.add(
    'OCRD_METS_CACHING',
    description='If set to `true`, access to the METS file is cached, speeding in-memory search and modification.',
    parser=_parser_boolean,
    validator=_validator_boolean,
)

# Integer limits, each with a default.
config.add(
    'OCRD_MAX_PROCESSOR_CACHE',
    description="Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.",
    default=(True, 128),
    parser=int,
)

config.add(
    'OCRD_MAX_PARALLEL_PAGES',
    description="Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set >1, then a METS Server must be used for METS synchronisation.",
    default=(True, 1),
    parser=int,
)

config.add(
    'OCRD_PROCESSING_PAGE_TIMEOUT',
    description="Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.",
    default=(True, 0),
    parser=int,
)

# Comma-separated list; every item must be one of the known tokens
# (the empty string covers the unset default).
config.add(
    "OCRD_PROFILE",
    description="""\
Whether to enable gathering runtime statistics
on the `ocrd.profile` logger (comma-separated):

- `CPU`: yields CPU and wall-time,
- `RSS`: also yields peak memory (resident set size)
- `PSS`: also yields peak memory (proportional set size)

""",
    default=(True, ''),
    validator=lambda val: set(val.split(',')) <= {'', 'CPU', 'RSS', 'PSS'},
)

config.add(
    "OCRD_PROFILE_FILE",
    description="If set, then the CPU profile is written to this file for later peruse with a analysis tools like snakeviz",
)
160
161
def _validator_int(val):
    """
    Return whether ``val`` can be converted by ``int()``.

    Using ``int`` itself as the validator would treat the parsed number as
    the verdict, so that a value of ``0`` is reported as invalid.
    """
    try:
        int(val)
    except (TypeError, ValueError):
        return False
    return True

config.add("OCRD_DOWNLOAD_RETRIES",
    description="Number of times to retry failed attempts for downloads of resources or workspace files.",
    validator=_validator_int,
    parser=int)
165
166
def _ocrd_download_timeout_parser(val):
167
    timeout = val.split(',')
168
    if len(timeout) > 1:
169
        timeout = tuple(float(x) for x in timeout)
170
    else:
171
        timeout = float(timeout[0])
172
    return timeout
173
174
# --- downloads ---------------------------------------------------------

config.add(
    "OCRD_DOWNLOAD_TIMEOUT",
    description="Timeout in seconds for connecting or reading (comma-separated) when downloading.",
    parser=_ocrd_download_timeout_parser,
)

config.add(
    "OCRD_DOWNLOAD_INPUT",
    description="Whether to download files not present locally during processing",
    parser=_parser_boolean,
    validator=_validator_boolean,
    default=(True, True),
)

# --- handling of missing / existing files ------------------------------

config.add(
    "OCRD_MISSING_INPUT",
    description="""\
How to deal with missing input files (for some fileGrp/pageId) during processing:

 - `SKIP`: ignore and proceed with next page's input
 - `ABORT`: throw :py:class:`.MissingInputFile`

""",
    parser=str,
    validator=lambda val: val in ('SKIP', 'ABORT'),
    default=(True, 'SKIP'),
)

config.add(
    "OCRD_MISSING_OUTPUT",
    description="""\
How to deal with missing output files (for some fileGrp/pageId) during processing:

 - `SKIP`: ignore and proceed processing next page
 - `COPY`: fall back to copying input PAGE to output fileGrp for page
 - `ABORT`: re-throw whatever caused processing to fail

""",
    parser=str,
    validator=lambda val: val in ('SKIP', 'COPY', 'ABORT'),
    default=(True, 'SKIP'),
)

config.add(
    "OCRD_MAX_MISSING_OUTPUTS",
    description="Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).",
    parser=float,
    default=(True, 0.1),
)

config.add(
    "OCRD_EXISTING_OUTPUT",
    description="""\
How to deal with already existing output files (for some fileGrp/pageId) during processing:

 - `SKIP`: ignore and proceed processing next page
 - `OVERWRITE`: force writing result to output fileGrp for page
 - `ABORT`: re-throw :py:class:`FileExistsError`

""",
    parser=str,
    validator=lambda val: val in ('SKIP', 'OVERWRITE', 'ABORT'),
    default=(True, 'SKIP'),
)

# --- ocrd network ------------------------------------------------------

config.add(
    "OCRD_NETWORK_SERVER_ADDR_PROCESSING",
    description="Default address of Processing Server to connect to (for `ocrd network client processing`).",
    default=(True, ''),
)

config.add(
    "OCRD_NETWORK_CLIENT_POLLING_SLEEP",
    description="How many seconds to sleep before trying again.",
    default=(True, 30),
    parser=int,
)

config.add(
    "OCRD_NETWORK_CLIENT_POLLING_TIMEOUT",
    description="Timeout for a blocking ocrd network client (in seconds).",
    default=(True, 3600),
    parser=int,
)

config.add(
    "OCRD_NETWORK_SERVER_ADDR_WORKFLOW",
    description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).",
    default=(True, ''),
)

config.add(
    "OCRD_NETWORK_SERVER_ADDR_WORKSPACE",
    description="Default address of Workspace Server to connect to (for `ocrd network client workspace`).",
    default=(True, ''),
)

config.add(
    "OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS",
    description="Number of attempts for a RabbitMQ client to connect before failing.",
    default=(True, 3),
    parser=int,
)

# The two root directories below are created as soon as this module is
# imported, using either the environment value or the default.
config.add(
    "OCRD_NETWORK_SOCKETS_ROOT_DIR",
    description="The root directory where all mets server related socket files are created",
    default=(True, Path(gettempdir()) / "ocrd_network_sockets"),
    parser=Path,
)
config.OCRD_NETWORK_SOCKETS_ROOT_DIR.mkdir(parents=True, exist_ok=True)

config.add(
    "OCRD_NETWORK_LOGS_ROOT_DIR",
    description="The root directory where all ocrd_network related file logs are stored",
    default=(True, Path(gettempdir()) / "ocrd_network_logs"),
    parser=Path,
)
config.OCRD_NETWORK_LOGS_ROOT_DIR.mkdir(parents=True, exist_ok=True)

# --- home and XDG directories ------------------------------------------
# cf. https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html

config.add(
    "HOME",
    description="Directory to look for `ocrd_logging.conf`, fallback for unset XDG variables.",
    # callable default: evaluated on access, not at registration time
    default=(True, Path.home),
    parser=Path,
    validator=lambda val: Path(val).is_dir(),
)

config.add(
    "XDG_DATA_HOME",
    description="Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location)",
    # deferred so that the current value of HOME is used on each access
    default=(True, lambda: Path(config.HOME, '.local/share')),
    parser=Path,
)

config.add(
    "XDG_CONFIG_HOME",
    description="Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database)",
    # deferred so that the current value of HOME is used on each access
    default=(True, lambda: Path(config.HOME, '.config')),
    parser=Path,
)

# --- logging -----------------------------------------------------------

config.add(
    "OCRD_LOGGING_DEBUG",
    description="Print information about the logging setup to STDERR",
    parser=_parser_boolean,
    validator=_validator_boolean,
    default=(True, False),
)
288