Passed
Pull Request — master (#1240)
by Konstantin
03:00
created

ocrd.processor.helpers.run_cli()   D

Complexity

Conditions 12

Size

Total Lines 62
Code Lines 41

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 41
dl 0
loc 62
rs 4.8
c 0
b 0
f 0
cc 12
nop 14

How to fix   Long Method    Complexity    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

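Commonly applied refactorings include Extract Method: move a cohesive group of statements into its own, well-named function and call it from the original one.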

Complexity

Complex units like the function ocrd.processor.helpers.run_cli() often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to find such a component is to look for variables or steps that belong together, for example names that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

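There are several approaches to avoid long parameter lists, for example grouping related parameters into a single parameter object, passing a whole object instead of several of its fields, or letting the callee compute a value itself instead of receiving it as a parameter.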

1
"""
2
Helper methods for running and documenting processors
3
"""
4
from os import chdir, getcwd
5
from time import perf_counter, process_time
6
from functools import lru_cache
7
import json
8
import inspect
9
from subprocess import run
10
from typing import List
11
12
from click import wrap_text
13
from ocrd.workspace import Workspace
14
from ocrd_utils import freeze_args, getLogger, config, setOverrideLogLevel, getLevelName, sparkline
15
16
17
# Names exported by ``from ocrd.processor.helpers import *``.
__all__ = ['generate_processor_help', 'run_cli', 'run_processor']
22
23
24
def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None, mets_server_url=None):
    """
    Return ``workspace`` if one was passed, otherwise create one through ``resolver``.

    Args:
        workspace: an already opened workspace; returned unchanged when not ``None``.
        resolver: object providing ``workspace_from_url``; required if ``workspace`` is ``None``.
        mets_url: METS location handed to ``workspace_from_url``; required if ``workspace`` is ``None``.
        working_dir: forwarded to ``workspace_from_url`` as ``dst_dir``.
        mets_server_url: forwarded to ``workspace_from_url`` as ``mets_server_url``.

    Raises:
        Exception: if ``workspace`` is ``None`` and ``resolver`` or ``mets_url`` is missing.
    """
    # Nothing to create when the caller already holds a workspace.
    if workspace is not None:
        return workspace
    if resolver is None:
        raise Exception("Need to pass a resolver to create a workspace")
    if mets_url is None:
        raise Exception("Need to pass mets_url to create a workspace")
    return resolver.workspace_from_url(mets_url, dst_dir=working_dir, mets_server_url=mets_server_url)
32
33
def run_processor(
        processorClass,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        show_resource=None,
        list_resources=False,
        parameter=None,
        parameter_override=None,
        working_dir=None,
        mets_server_url=None,
        instance_caching=False
): # pylint: disable=too-many-locals
    """
    Instantiate a Pythonic processor, open a workspace, run the processor and save the workspace.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Instantiate a Python object for :py:attr:`processorClass`, passing:
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter`

    Warning: Avoid setting the `instance_caching` flag to True. It may have unexpected side effects.
    This flag is used for an experimental feature we would like to adopt in future.

    Run the processor on the workspace (creating output files in the filesystem),
    logging wall/CPU time (and, if ``config.OCRD_PROFILE`` contains ``RSS`` or ``PSS``,
    memory consumption) to the ``ocrd.process.profile`` logger.

    Finally, record the run as an agent in the METS and write back the workspace
    (updating the METS in the filesystem).

    Args:
        processorClass (object): Python class of the module processor.
        mets_url: METS location, used only if :py:attr:`workspace` is None.
        resolver: resolver used to open the workspace if :py:attr:`workspace` is None.
        workspace: an already opened workspace to reuse.
        page_id: page selection assigned to the processor.
        log_level: if set, passed to ``setOverrideLogLevel``.
        input_file_grp: input file group(s) assigned to the processor.
        output_file_grp: output file group(s) assigned to the processor.
        show_resource: accepted for interface compatibility; not used by this function.
        list_resources: accepted for interface compatibility; not used by this function.
        parameter: parameters passed on to the processor.
        parameter_override: accepted for interface compatibility; not used by this
            function (any overrides must already be merged into :py:attr:`parameter`).
        working_dir: working directory, used only if :py:attr:`workspace` is None.
        mets_server_url: METS server URL, used only if :py:attr:`workspace` is None.
        instance_caching (bool): whether to fetch the processor instance from the cache.

    Returns:
        The processor instance that was run.

    Raises:
        Exception: whatever the processor raises while processing; it is logged
            and re-raised unchanged.
    """
    if log_level:
        setOverrideLogLevel(log_level)
    workspace = _get_workspace(
        workspace,
        resolver,
        mets_url,
        working_dir,
        mets_server_url
    )
    log = getLogger('ocrd.processor.helpers.run_processor')
    log.debug("Running processor %s", processorClass)

    processor = get_processor(
        processor_class=processorClass,
        parameter=parameter,
        # the workspace is not attached here; it is handed to process_workspace() below
        workspace=None,
        page_id=page_id,
        input_file_grp=input_file_grp,
        output_file_grp=output_file_grp,
        instance_caching=instance_caching
    )

    ocrd_tool = processor.ocrd_tool
    name = '%s v%s' % (ocrd_tool['executable'], processor.version)
    otherrole = ocrd_tool['steps'][0]
    logProfile = getLogger('ocrd.process.profile')
    log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
    t0_wall = perf_counter()
    t0_cpu = process_time()
    if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
        backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
        # imported lazily so that memory_profiler is only needed when profiling is requested
        from memory_profiler import memory_usage
        try:
            mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {}),
                                     # only run process once
                                     max_iterations=1,
                                     interval=.1, timeout=None, timestamps=True,
                                     # include sub-processes
                                     multiprocess=True, include_children=True,
                                     # get proportional set size instead of RSS
                                     backend=backend)
        except Exception:
            log.exception("Failure in processor '%s'", ocrd_tool['executable'])
            # bare raise keeps the original traceback untouched
            raise
        # with timestamps=True each sample is a (memory, timestamp) pair
        mem_usage_values = [mem for mem, _ in mem_usage]
        mem_output = 'memory consumption: '
        mem_output += sparkline(mem_usage_values)
        mem_output += ' max: %.2f MiB min: %.2f MiB' % (max(mem_usage_values), min(mem_usage_values))
        logProfile.info(mem_output)
    else:
        try:
            processor.process_workspace(workspace)
        except Exception:
            log.exception("Failure in processor '%s'", ocrd_tool['executable'])
            raise

    t1_wall = perf_counter() - t0_wall
    t1_cpu = process_time() - t0_cpu
    # arguments are passed lazily; the rendered message is unchanged
    logProfile.info(
        "Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']",
        ocrd_tool['executable'],
        t1_wall,
        t1_cpu,
        processor.input_file_grp or '',
        processor.output_file_grp or '',
        json.dumps(processor.parameter) or '',
        processor.page_id or ''
    )
    workspace.mets.add_agent(
        name=name,
        _type='OTHER',
        othertype='SOFTWARE',
        role='OTHER',
        otherrole=otherrole,
        notes=[({'option': 'input-file-grp'}, processor.input_file_grp or ''),
               ({'option': 'output-file-grp'}, processor.output_file_grp or ''),
               ({'option': 'parameter'}, json.dumps(processor.parameter or '')),
               ({'option': 'page-id'}, processor.page_id or '')]
    )
    workspace.save_mets()
    return processor
154
155
156
def run_cli(
        executable,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        overwrite=None,
        debug=None,
        log_level=None,
        log_filename=None,
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
        mets_server_url=None,
):
    """
    Open a workspace and run a processor on the command line.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Run the processor CLI :py:attr:`executable` as a subprocess, passing:
    - the workspace directory (``--working-dir``)
    - :py:attr:`mets_url` (``--mets``, only if given)
    - :py:attr:`log_level`
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter`
    - :py:attr:`overwrite` and :py:attr:`debug` flags
    - :py:attr:`mets_server_url`

    (Will create output files and update the METS in the filesystem).

    Args:
        executable (string): Executable name of the module processor.
        mets_url: METS location; also used to open the workspace if none is passed.
        resolver: resolver used to open the workspace if :py:attr:`workspace` is None.
        workspace: an already opened workspace to reuse.
        page_id: value for ``--page-id``.
        overwrite: if truthy, pass ``--overwrite``.
        debug: if truthy, pass ``--debug``.
        log_level: level name, or a numeric level converted with ``getLevelName``.
        log_filename: if set, the subprocess' stdout and stderr are appended to this file.
        input_file_grp: value for ``--input-file-grp``.
        output_file_grp: value for ``--output-file-grp``.
        parameter: value for ``--parameter``; anything that is not a string is
            serialized to JSON first.
        working_dir: working directory, used only if :py:attr:`workspace` is None.
        mets_server_url: value for ``--mets-server-url``.

    Returns:
        int: the exit code of the subprocess.
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    args = [executable, '--working-dir', workspace.directory]
    # A caller may pass a ready workspace without mets_url; putting None into argv
    # would raise TypeError. NOTE(review): in that case the processor falls back to
    # its own --mets default - confirm this matches the workspace's METS file name.
    if mets_url is not None:
        args += ['--mets', mets_url]
    if log_level:
        args += ['--log-level', log_level if isinstance(log_level, str) else getLevelName(log_level)]
    if page_id:
        args += ['--page-id', page_id]
    if input_file_grp:
        args += ['--input-file-grp', input_file_grp]
    if output_file_grp:
        args += ['--output-file-grp', output_file_grp]
    if parameter:
        # --parameter expects a JSON string or a JSON file path
        args += ['--parameter', parameter if isinstance(parameter, str) else json.dumps(parameter)]
    if overwrite:
        args += ['--overwrite']
    if debug:
        args += ['--debug']
    if mets_server_url:
        args += ['--mets-server-url', mets_server_url]
    log = getLogger('ocrd.processor.helpers.run_cli')
    # str() keeps the debug message from failing on path-like arguments
    log.debug("Running subprocess '%s'", ' '.join(str(arg) for arg in args))
    if not log_filename:
        result = run(args, check=False)
    else:
        with open(log_filename, 'a') as file_desc:
            result = run(args, check=False, stdout=file_desc, stderr=file_desc)
    return result.returncode
218
219
220
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
    """Generate a string describing the full CLI of this processor including params.

    Args:
        ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``;
            ``executable`` and ``description`` are read unconditionally, ``parameters`` if present.
        processor_instance (object, optional): the processor implementation
            (for adding any module/class/function docstrings)
        subcommand (string): 'worker' or 'server'; if empty, the help for
            local processing is generated.

    Returns:
        str: the help text, or ``None`` if ``subcommand`` is neither empty,
        ``'worker'`` nor ``'server'``.
    """
    doc_help = ''
    if processor_instance:
        module = inspect.getmodule(processor_instance)
        # module, class and method docstrings, in this order; empty ones are skipped
        docstrings = [
            module.__doc__ if module else None,
            processor_instance.__doc__,
            processor_instance.process_workspace.__doc__,
            processor_instance.process.__doc__,
        ]
        doc_help = ''.join('\n' + inspect.cleandoc(doc) + '\n' for doc in docstrings if doc)
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)
    subcommands = '''\
    worker      Start a processing worker rather than do local processing
    server      Start a processor server rather than do local processing
'''

    processing_worker_options = '''\
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --log-filename                  Filename to redirect STDOUT/STDERR to,
                                  if specified.
'''

    processing_server_options = '''\
  --address                       The Processor server address in format
                                  "{host}:{port}"
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
'''

    processing_options = '''\
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those).
                                  Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
  --debug                         Abort on any errors with full stack trace.
                                  Short-hand for OCRD_MISSING_OUTPUT=ABORT
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
                                  If URL starts with http:// start an HTTP server there,
                                  otherwise URL is a path to an on-demand-created unix socket
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]
'''

    information_options = '''\
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version
'''

    parameter_help = ''
    if not ocrd_tool.get('parameters'):
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        for param_name, param in ocrd_tool['parameters'].items():
            # a required parameter is flagged as such; otherwise show its default, if any
            if param.get('required'):
                qualifier = ' - REQUIRED'
            elif 'default' in param:
                qualifier = ' - %s' % json.dumps(param['default'])
            else:
                qualifier = ''
            parameter_help += wrap('"%s" [%s%s]' % (param_name, param['type'], qualifier))
            parameter_help += '\n ' + wrap(param['description'])
            if 'enum' in param:
                parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
            parameter_help += "\n"

    if not subcommand:
        return f'''\
Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]

  {ocrd_tool['description']}{doc_help}

Subcommands:
{subcommands}
Options for processing:
{processing_options}
Options for information:
{information_options}
Parameters:
{parameter_help}
'''
    if subcommand == 'worker':
        return f'''\
Usage: {ocrd_tool['executable']} worker [OPTIONS]

  Run {ocrd_tool['executable']} as a processing worker.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_worker_options}
'''
    if subcommand == 'server':
        return f'''\
Usage: {ocrd_tool['executable']} server [OPTIONS]

  Run {ocrd_tool['executable']} as a processor server.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_server_options}
'''
    # unknown subcommand: there is no help text to offer
    return None
360
361
362
# Taken from https://github.com/OCR-D/core/pull/884
@freeze_args
@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE)
def get_cached_processor(parameter: dict, processor_class):
    """
    Return a set-up processor instance, memoized per combination of arguments.

    At most ``config.OCRD_MAX_PROCESSOR_CACHE`` results are kept by the LRU cache.
    NOTE(review): ``freeze_args`` presumably turns the parameter dict into a
    hashable form so it can serve as a cache key - confirm in ``ocrd_utils``.

    Args:
        parameter (dict): a dictionary of parameters.
        processor_class: the concrete `:py:class:~ocrd.Processor` class.
    Returns:
        `None` if ``processor_class`` is falsy. Otherwise a new instance of
        ``processor_class`` on which ``setup()`` has already been called.
    """
    if not processor_class:
        return None
    # hand the processor a plain dict copy (or None when there are no parameters)
    params = dict(parameter) if parameter else None
    instance = processor_class(None, parameter=params)
    instance.setup()
    return instance
382
383
384
def get_processor(
        processor_class,
        parameter: dict,
        workspace: Workspace = None,
        page_id: str = None,
        input_file_grp: List[str] = None,
        output_file_grp: List[str] = None,
        instance_caching: bool = False,
):
    """
    Create (or fetch from the cache) a processor instance and bind it to a job.

    Args:
        processor_class: the concrete processor class to instantiate.
        parameter (dict): parameters passed to the processor.
        workspace: value assigned to the instance's ``workspace`` attribute.
        page_id: value assigned to the instance's ``page_id`` attribute.
        input_file_grp: value assigned to the instance's ``input_file_grp`` attribute.
        output_file_grp: value assigned to the instance's ``output_file_grp`` attribute.
        instance_caching (bool): if true, obtain the instance through
            ``get_cached_processor``; otherwise instantiate and ``setup()`` a new one.

    Returns:
        The processor instance with the four attributes above set.

    Raises:
        ValueError: if ``processor_class`` is falsy.
    """
    if not processor_class:
        raise ValueError("Processor class is not known")
    if instance_caching:
        instance = get_cached_processor(parameter, processor_class)
    else:
        instance = processor_class(None, parameter=parameter)
        instance.setup()
    # per-run settings are (re)assigned on every call, also for cached instances
    run_settings = (
        ('workspace', workspace),
        ('page_id', page_id),
        ('input_file_grp', input_file_grp),
        ('output_file_grp', output_file_grp),
    )
    for attribute, value in run_settings:
        setattr(instance, attribute, value)
    return instance
405