Passed
Pull Request — master (#1240)
by
unknown
02:48
created

ocrd_utils.config.OcrdEnvConfig.__init__()   A

Complexity

Conditions 1

Size

Total Lines 2
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 2
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""
2
Most behavior of OCR-D is controlled via command-line flags or keyword args.
3
Some behavior is global or too cumbersome to handle via explicit code and
4
better solved by using environment variables.
5
6
OcrdEnvConfig is a base class to make this more streamlined, to be subclassed
7
in the `ocrd` package for the actual values
8
"""
9
10
from os import environ
11
from pathlib import Path
12
from tempfile import gettempdir
13
from textwrap import fill, indent
14
15
16
def _validator_boolean(val):
17
    return isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1')
18
19
def _parser_boolean(val):
20
    return bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')
21
22
class OcrdEnvVariable():
    """
    A single environment variable: its name, help text, validation and
    parsing functions, and an optional default value.
    """

    def __init__(self, name, description, parser=str, validator=lambda val: True, default=(False, None)):
        """
        An environment variable for use in OCR-D.

        Args:
            name (str): Name of the environment variable
            description (str): Description of what the variable is used for.

        Keyword Args:
            parser (callable): Function to transform the raw (string) value to whatever is needed.
            validator (callable): Function to validate that the raw (string) value is parseable.
            default (tuple(bool, any)): 2-tuple, first element is a bool whether there is a default
                value defined and second element contains that default value, which can be a callable
                for deferred evaluation
        """
        self.name = name
        self.description = description
        self.parser = parser
        self.validator = validator
        # indexed rather than unpacked so any 2-element sequence keeps working
        self.has_default = default[0]
        self.default = default[1]

    def __str__(self):
        """Return ``"<name>: <description>"``."""
        return f'{self.name}: {self.description}'

    def describe(self, wrap_text=True, indent_text=True):
        """
        Build a two-part help text: the variable name on the first line,
        followed by the description (with the default appended, if any).

        Keyword Args:
            wrap_text (bool): Re-flow the description to a width of 50 characters.
            indent_text (bool): Indent the name by 2 and the description by 4 spaces.

        Returns:
            str: the formatted help text
        """
        desc = self.description
        if self.has_default:
            # callable defaults are evaluated only now (deferred evaluation)
            default = self.default() if callable(self.default) else self.default
            desc += f' (Default: "{default}")'
        ret = f'{self.name}\n'
        if wrap_text:
            desc = fill(desc, width=50)
        if indent_text:
            ret = f'  {ret}'
            desc = indent(desc, '    ')
        return ret + desc
62
63
class OcrdEnvConfig():
    """
    Registry of environment variables.

    Variables are registered with :py:meth:`add` and then read as attributes
    of the instance: the raw value is taken from ``os.environ`` (or from the
    variable's default), validated, parsed and returned.
    """

    def __init__(self):
        # maps variable name -> OcrdEnvVariable
        self._variables = {}

    def _registered(self, name):
        """Return the variable registered as ``name`` or raise ``ValueError``."""
        if name not in self._variables:
            raise ValueError(f"Unregistered env variable {name}")
        return self._variables[name]

    def add(self, name, *args, **kwargs):
        """
        Register a new variable and append its description to the module docstring.

        Args:
            name (str): Name of the environment variable
            *args, **kwargs: passed on to :py:class:`OcrdEnvVariable`

        Returns:
            OcrdEnvVariable: the newly registered variable
        """
        var = OcrdEnvVariable(name, *args, **kwargs)
        # make visible in ocrd_utils.config docstring (apidoc)
        txt = var.describe(wrap_text=False, indent_text=True)
        module_ns = globals()
        # __doc__ is None when docstrings are stripped (python -OO) or absent
        module_ns['__doc__'] = (module_ns.get('__doc__') or '') + "\n\n - " + txt + "\n\n"
        self._variables[name] = var
        return var

    def has_default(self, name):
        """
        Tell whether the registered variable ``name`` defines a default.

        Raises:
            ValueError: if ``name`` has not been registered
        """
        return self._registered(name).has_default

    def describe(self, name, *args, **kwargs):
        """
        Return the help text of the registered variable ``name``; extra
        arguments are passed on to :py:meth:`OcrdEnvVariable.describe`.

        Raises:
            ValueError: if ``name`` has not been registered
        """
        return self._registered(name).describe(*args, **kwargs)

    def __getattr__(self, name):
        """
        Return the validated and parsed value of the registered variable ``name``.

        Only invoked when regular attribute lookup fails.

        Raises:
            AttributeError: if the registry itself does not exist yet, or if
                ``name`` is an unregistered dunder name (protocol probes such
                as ``hasattr(obj, '__deepcopy__')`` rely on this)
            ValueError: if ``name`` is not registered, or its value is invalid
            KeyError: if the variable is unset and has no default
        """
        # go through __dict__ directly: self._variables would re-enter
        # __getattr__ and recurse forever on a half-constructed instance
        registry = self.__dict__.get('_variables')
        is_dunder = name.startswith('__') and name.endswith('__')
        if registry is None or (is_dunder and name not in registry):
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")
        var_obj = self._registered(name)
        try:
            raw_value = self.raw_value(name)
        except KeyError:
            if not var_obj.has_default:
                raise
            raw_value = var_obj.default() if callable(var_obj.default) else var_obj.default
        if not var_obj.validator(raw_value):
            raise ValueError(f"'{name}' set to invalid value '{raw_value}'")
        return var_obj.parser(raw_value)

    def is_set(self, name):
        """
        Tell whether the registered variable ``name`` is present in the environment.

        Raises:
            ValueError: if ``name`` has not been registered
        """
        self._registered(name)
        return name in environ

    def raw_value(self, name):
        """
        Return the unparsed environment value of the registered variable ``name``.

        Raises:
            ValueError: if ``name`` has not been registered
            KeyError: if the variable is not set in the environment
        """
        self._registered(name)
        return environ[name]
110
111
# Module-wide registry; every variable below is registered on it at import time.
config = OcrdEnvConfig()

config.add(
    'OCRD_METS_CACHING',
    description='If set to `true`, access to the METS file is cached, speeding in-memory search and modification.',
    parser=_parser_boolean,
    validator=_validator_boolean,
)

config.add(
    'OCRD_MAX_PROCESSOR_CACHE',
    description="Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.",
    default=(True, 128),
    parser=int,
)

config.add(
    "OCRD_PROFILE",
    description="""\
Whether to enable gathering runtime statistics
on the `ocrd.profile` logger (comma-separated):

- `CPU`: yields CPU and wall-time,
- `RSS`: also yields peak memory (resident set size)
- `PSS`: also yields peak memory (proportional set size)

""",
    default=(True, ''),
    # every comma-separated token must be one of the known keywords (or empty)
    validator=lambda val: set(val.split(',')) <= {'', 'CPU', 'RSS', 'PSS'},
)

config.add(
    "OCRD_PROFILE_FILE",
    description="If set, then the CPU profile is written to this file for later peruse with a analysis tools like snakeviz",
)
138
139
def _validator_integer(val):
    """
    Check whether ``val`` can be converted with ``int()``.

    Returns a real ``bool``. Using ``int`` itself as the validator would
    return the converted number, so a legitimate ``0`` would be falsy and
    therefore be rejected as invalid.
    """
    try:
        int(val)
    except (TypeError, ValueError):
        return False
    return True


config.add("OCRD_DOWNLOAD_RETRIES",
    description="Number of times to retry failed attempts for downloads of resources or workspace files.",
    validator=_validator_integer,
    parser=int)
143
144
def _ocrd_download_timeout_parser(val):
145
    timeout = val.split(',')
146
    if len(timeout) > 1:
147
        timeout = tuple(float(x) for x in timeout)
148
    else:
149
        timeout = float(timeout[0])
150
    return timeout
151
152
config.add(
    "OCRD_DOWNLOAD_TIMEOUT",
    description="Timeout in seconds for connecting or reading (comma-separated) when downloading.",
    parser=_ocrd_download_timeout_parser,
)

config.add(
    "OCRD_DOWNLOAD_INPUT",
    description="Whether to download files not present locally during processing",
    parser=_parser_boolean,
    validator=_validator_boolean,
    default=(True, True),
)

config.add(
    "OCRD_MISSING_INPUT",
    description="""\
How to deal with missing input files (for some fileGrp/pageId) during processing:

 - `SKIP`: ignore and proceed with next page's input
 - `ABORT`: throw :py:class:`.MissingInputFile`

""",
    parser=str,
    validator=lambda val: val in ('SKIP', 'ABORT'),
    default=(True, 'SKIP'),
)

config.add(
    "OCRD_MISSING_OUTPUT",
    description="""\
How to deal with missing output files (for some fileGrp/pageId) during processing:

 - `SKIP`: ignore and proceed processing next page
 - `COPY`: fall back to copying input PAGE to output fileGrp for page
 - `ABORT`: re-throw whatever caused processing to fail

""",
    parser=str,
    validator=lambda val: val in ('SKIP', 'COPY', 'ABORT'),
    default=(True, 'SKIP'),
)

config.add(
    "OCRD_MAX_MISSING_OUTPUTS",
    description="Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).",
    parser=float,
    default=(True, 0.1),
)

config.add(
    "OCRD_EXISTING_OUTPUT",
    description="""\
How to deal with already existing output files (for some fileGrp/pageId) during processing:

 - `SKIP`: ignore and proceed processing next page
 - `OVERWRITE`: force writing result to output fileGrp for page
 - `ABORT`: re-throw :py:class:`FileExistsError`

""",
    parser=str,
    validator=lambda val: val in ('SKIP', 'OVERWRITE', 'ABORT'),
    default=(True, 'SKIP'),
)
204
205
config.add(
    "OCRD_NETWORK_SERVER_ADDR_PROCESSING",
    description="Default address of Processing Server to connect to (for `ocrd network client processing`).",
    default=(True, ''),
)

config.add(
    "OCRD_NETWORK_CLIENT_POLLING_SLEEP",
    description="How many seconds to sleep before trying again.",
    default=(True, 30),
    parser=int,
)

config.add(
    "OCRD_NETWORK_CLIENT_POLLING_TIMEOUT",
    description="Timeout for a blocking ocrd network client (in seconds).",
    default=(True, 3600),
    parser=int,
)

config.add(
    "OCRD_NETWORK_SERVER_ADDR_WORKFLOW",
    description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).",
    default=(True, ''),
)

config.add(
    "OCRD_NETWORK_SERVER_ADDR_WORKSPACE",
    description="Default address of Workspace Server to connect to (for `ocrd network client workspace`).",
    default=(True, ''),
)

config.add(
    "OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS",
    description="Number of attempts for a RabbitMQ client to connect before failing.",
    default=(True, 3),
    parser=int,
)

config.add(
    "OCRD_NETWORK_SOCKETS_ROOT_DIR",
    description="The root directory where all mets server related socket files are created",
    default=(True, Path(gettempdir(), "ocrd_network_sockets")),
    parser=Path,
)
# the directory is created as a side effect of importing this module
config.OCRD_NETWORK_SOCKETS_ROOT_DIR.mkdir(parents=True, exist_ok=True)

config.add(
    "OCRD_NETWORK_LOGS_ROOT_DIR",
    description="The root directory where all ocrd_network related file logs are stored",
    default=(True, Path(gettempdir(), "ocrd_network_logs")),
    parser=Path,
)
# the directory is created as a side effect of importing this module
config.OCRD_NETWORK_LOGS_ROOT_DIR.mkdir(parents=True, exist_ok=True)
243
244
# HOME directory, cf. https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html
config.add(
    "HOME",
    description="Directory to look for `ocrd_logging.conf`, fallback for unset XDG variables.",
    parser=Path,
    validator=lambda val: Path(val).is_dir(),
    # callable default: evaluated only when the value is requested
    default=(True, Path.home),
)

config.add(
    "XDG_DATA_HOME",
    description="Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location)",
    parser=Path,
    # resolved lazily, so it follows whatever HOME yields at access time
    default=(True, lambda: Path(config.HOME, '.local/share')),
)

config.add(
    "XDG_CONFIG_HOME",
    description="Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database)",
    parser=Path,
    # resolved lazily, so it follows whatever HOME yields at access time
    default=(True, lambda: Path(config.HOME, '.config')),
)

config.add(
    "OCRD_LOGGING_DEBUG",
    description="Print information about the logging setup to STDERR",
    parser=_parser_boolean,
    validator=_validator_boolean,
    default=(True, False),
)
266