Passed
Push — master ( caa7ac...421b06 )
by Konstantin
02:17 queued 01:26
created

ocrd.processor.helpers.run_cli()   C

Complexity

Conditions 9

Size

Total Lines 54
Code Lines 34

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 34
dl 0
loc 54
rs 6.6666
c 0
b 0
f 0
cc 9
nop 12

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

- Extract Method

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

- Replace Parameter with Method Call
- Preserve Whole Object
- Introduce Parameter Object

1
"""
2
Helper methods for running and documenting processors
3
"""
4
from os import chdir, getcwd
5
from time import perf_counter, process_time
6
from functools import lru_cache
7
import json
8
import inspect
9
from subprocess import run
10
from typing import List
11
12
from click import wrap_text
13
from ocrd.workspace import Workspace
14
from ocrd_utils import freeze_args, getLogger, config, setOverrideLogLevel, getLevelName
15
16
17
__all__ = [
18
    'generate_processor_help',
19
    'run_cli',
20
    'run_processor'
21
]
22
23
24
def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None, mets_server_url=None):
    """
    Return the workspace to operate on, creating one if none was passed.

    If :py:attr:`workspace` is not None it is returned unchanged. Otherwise a
    workspace is obtained from ``resolver.workspace_from_url``, called with
    :py:attr:`mets_url`, ``dst_dir=working_dir`` and :py:attr:`mets_server_url`.

    Raises:
        Exception: if a workspace has to be created but :py:attr:`resolver`
            or :py:attr:`mets_url` is None.
    """
    # An existing workspace always wins; the other arguments are ignored then.
    if workspace is not None:
        return workspace
    if resolver is None:
        raise Exception("Need to pass a resolver to create a workspace")
    if mets_url is None:
        raise Exception("Need to pass mets_url to create a workspace")
    return resolver.workspace_from_url(mets_url, dst_dir=working_dir, mets_server_url=mets_server_url)
32
33
def run_processor(
        processorClass,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        show_resource=None,
        list_resources=False,
        parameter=None,
        parameter_override=None,
        working_dir=None,
        mets_server_url=None,
        instance_caching=False  # TODO don't set this yet!
):  # pylint: disable=too-many-locals
    """
    Instantiate a Pythonic processor, open a workspace, run the processor and save the workspace.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Instantiate a Python object for :py:attr:`processorClass`, passing:
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter`

    The workspace is assigned to the processor after instantiation.

    Warning: Avoid setting the `instance_caching` flag to True. It may have unexpected side effects.
    This flag is used for an experimental feature we would like to adopt in future.

    Run the processor on the workspace (creating output files in the filesystem).
    While the processor runs, the current working directory is the workspace
    directory; the previous working directory is restored afterwards, also
    when anything fails in between.

    If ``config.OCRD_PROFILE`` contains ``RSS`` or ``PSS``, the run is wrapped
    in ``memory_profiler.memory_usage`` and a memory summary is logged to the
    ``ocrd.process.profile`` logger. Wall and CPU time are always logged there.

    Finally, record the processor as an agent in the METS and write back the
    workspace (updating the METS in the filesystem).

    Args:
        processorClass (object): Python class of the module processor.
        mets_url: METS location, used only if no :py:attr:`workspace` is passed.
        resolver: resolver used to create the workspace if none is passed.
        workspace: existing workspace to reuse.
        page_id: passed to the processor.
        log_level: if truthy, passed to ``setOverrideLogLevel``.
        input_file_grp: passed to the processor.
        output_file_grp: passed to the processor.
        show_resource: accepted for interface compatibility; not read here.
        list_resources: accepted for interface compatibility; not read here.
        parameter: passed to the processor.
        parameter_override: accepted for interface compatibility; not read here.
        working_dir: destination directory when creating the workspace.
        mets_server_url: passed on when creating the workspace.
        instance_caching (bool): reuse a cached processor instance (experimental).

    Returns:
        The processor instance that was run.

    Raises:
        Exception: whatever the processor raises is logged and re-raised.
    """
    if log_level:
        setOverrideLogLevel(log_level)
    workspace = _get_workspace(
        workspace,
        resolver,
        mets_url,
        working_dir,
        mets_server_url
    )
    log = getLogger('ocrd.processor.helpers.run_processor')
    log.debug("Running processor %s", processorClass)

    old_cwd = getcwd()
    processor = get_processor(
        processor_class=processorClass,
        parameter=parameter,
        workspace=None,
        page_id=page_id,
        input_file_grp=input_file_grp,
        output_file_grp=output_file_grp,
        instance_caching=instance_caching
    )
    processor.workspace = workspace

    mem_usage = None
    chdir(processor.workspace.directory)
    # Everything done inside the workspace directory is guarded by a single
    # ``finally`` so that a failure in the set-up steps (tool metadata lookup,
    # optional profiler imports) cannot leave the caller in the wrong directory.
    try:
        ocrd_tool = processor.ocrd_tool
        name = '%s v%s' % (ocrd_tool['executable'], processor.version)
        otherrole = ocrd_tool['steps'][0]
        logProfile = getLogger('ocrd.process.profile')
        log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
        t0_wall = perf_counter()
        t0_cpu = process_time()
        profile_memory = any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS'])
        if profile_memory:
            backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
            # optional dependencies, only needed when memory profiling is enabled
            from memory_profiler import memory_usage
            from sparklines import sparklines
        try:
            if profile_memory:
                mem_usage = memory_usage(proc=processor.process,
                                         # only run process once
                                         max_iterations=1,
                                         interval=.1, timeout=None, timestamps=True,
                                         # include sub-processes
                                         multiprocess=True, include_children=True,
                                         # get proportional set size instead of RSS
                                         backend=backend)
            else:
                processor.process()
        except Exception:
            log.exception("Failure in processor '%s'", ocrd_tool['executable'])
            # bare raise keeps the original traceback
            raise
    finally:
        chdir(old_cwd)

    if profile_memory:
        mem_usage_values = [mem for mem, _ in mem_usage]
        mem_output = 'memory consumption: '
        mem_output += ''.join(sparklines(mem_usage_values))
        mem_output += ' max: %.2f MiB min: %.2f MiB' % (max(mem_usage_values), min(mem_usage_values))
        logProfile.info(mem_output)

    t1_wall = perf_counter() - t0_wall
    t1_cpu = process_time() - t0_cpu
    logProfile.info(
        "Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']",
        ocrd_tool['executable'],
        t1_wall,
        t1_cpu,
        processor.input_file_grp or '',
        processor.output_file_grp or '',
        # json.dumps never returns an empty string, so no fallback is needed
        json.dumps(processor.parameter),
        processor.page_id or ''
    )
    workspace.mets.add_agent(
        name=name,
        _type='OTHER',
        othertype='SOFTWARE',
        role='OTHER',
        otherrole=otherrole,
        notes=[({'option': 'input-file-grp'}, processor.input_file_grp or ''),
               ({'option': 'output-file-grp'}, processor.output_file_grp or ''),
               ({'option': 'parameter'}, json.dumps(processor.parameter or '')),
               ({'option': 'page-id'}, processor.page_id or '')]
    )
    workspace.save_mets()
    return processor
162
163
164
def run_cli(
        executable,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        overwrite=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
        mets_server_url=None,
):
    """
    Open a workspace and run a processor on the command line.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Run the processor CLI :py:attr:`executable` on the workspace, passing:
    - the workspace directory (as ``--working-dir``)
    - :py:attr:`mets_url` (only if given)
    - :py:attr:`log_level`
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter`
    - :py:attr:`overwrite`
    - :py:attr:`mets_server_url`

    (Will create output files and update the METS in the filesystem).

    Args:
        executable (string): Executable name of the module processor.
        mets_url (string): METS location; omitted from the command line if None.
        resolver: resolver used to create the workspace if none is passed.
        workspace: existing workspace to reuse.
        page_id (string): value for ``--page-id``.
        overwrite (bool): if truthy, add ``--overwrite``.
        log_level: level name, or a value ``getLevelName`` can convert to one.
        input_file_grp (string): value for ``--input-file-grp``.
        output_file_grp (string): value for ``--output-file-grp``.
        parameter: value for ``--parameter``; anything that is not a string
            is serialized as JSON first.
        working_dir: destination directory when creating the workspace.
        mets_server_url (string): value for ``--mets-server-url``. It is only
            forwarded to the subprocess, not used to open the workspace here.

    Returns:
        int: the return code of the subprocess.
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    args = [executable, '--working-dir', workspace.directory]
    # With a pre-built workspace, mets_url may legitimately be None; a None
    # entry would break both the debug join below and the subprocess call.
    if mets_url:
        args += ['--mets', mets_url]
    if log_level:
        args += ['--log-level', log_level if isinstance(log_level, str) else getLevelName(log_level)]
    if page_id:
        args += ['--page-id', page_id]
    if input_file_grp:
        args += ['--input-file-grp', input_file_grp]
    if output_file_grp:
        args += ['--output-file-grp', output_file_grp]
    if parameter:
        # command line arguments must be strings
        args += ['--parameter', parameter if isinstance(parameter, str) else json.dumps(parameter)]
    if overwrite:
        args += ['--overwrite']
    if mets_server_url:
        args += ['--mets-server-url', mets_server_url]
    log = getLogger('ocrd.processor.helpers.run_cli')
    log.debug("Running subprocess '%s'", ' '.join(args))
    # check=False: a failing processor is reported via the return code
    result = run(args, check=False)
    return result.returncode
218
219
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
    """Generate a string describing the full CLI of this processor including params.

    Args:
        ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
        processor_instance (object, optional): the processor implementation
            (for adding any module/class/function docstrings)
        subcommand (string): 'worker' or 'server'

    Returns:
        The help text for plain processing (no subcommand), for the ``worker``
        subcommand or for the ``server`` subcommand. ``None`` for any other
        value of ``subcommand``.
    """
    doc_help = ''
    if processor_instance:
        # collect module, class and process() docstrings, in that order
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
        if processor_instance.process.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__) + '\n'
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)
    subcommands = '''\
    worker      Start a processing worker rather than do local processing
    server      Start a processor server rather than do local processing
'''

    processing_worker_options = '''\
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --log-filename                  Filename to redirect STDOUT/STDERR to,
                                  if specified.
'''

    processing_server_options = '''\
  --address                       The Processor server address in format
                                  "{host}:{port}"
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
'''

    processing_options = '''\
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those)
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
                                  If URL starts with http:// start an HTTP server there,
                                  otherwise URL is a path to an on-demand-created unix socket
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]
'''

    information_options = '''\
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version
'''

    if not ocrd_tool.get('parameters'):
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        # collect the pieces and join once instead of repeated concatenation
        pieces = []
        for param_name, param in ocrd_tool['parameters'].items():
            if param.get('required'):
                suffix = ' - REQUIRED'
            elif 'default' in param:
                suffix = ' - %s' % json.dumps(param['default'])
            else:
                suffix = ''
            pieces.append(wrap('"%s" [%s%s]' % (param_name, param['type'], suffix)))
            pieces.append('\n ' + wrap(param['description']))
            if 'enum' in param:
                pieces.append('\n ' + wrap('Possible values: %s' % json.dumps(param['enum'])))
            pieces.append("\n")
        parameter_help = ''.join(pieces)

    if not subcommand:
        return f'''\
Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]

  {ocrd_tool['description']}{doc_help}

Subcommands:
{subcommands}
Options for processing:
{processing_options}
Options for information:
{information_options}
Parameters:
{parameter_help}
'''
    if subcommand == 'worker':
        return f'''\
Usage: {ocrd_tool['executable']} worker [OPTIONS]

  Run {ocrd_tool['executable']} as a processing worker.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_worker_options}
'''
    if subcommand == 'server':
        return f'''\
Usage: {ocrd_tool['executable']} server [OPTIONS]

  Run {ocrd_tool['executable']} as a processor server.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_server_options}
'''
    # unknown subcommand: no help text available
    return None
354
355
356
# Taken from https://github.com/OCR-D/core/pull/884
357
@freeze_args
@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE)
def get_cached_processor(parameter: dict, processor_class):
    """
    Return a processor instance for the given class and parameters.

    Calls are memoized by ``lru_cache`` (bounded by
    ``config.OCRD_MAX_PROCESSOR_CACHE``), so repeated calls with equal
    arguments yield the same instance.
    NOTE(review): ``freeze_args`` presumably makes the ``parameter`` dict
    hashable for the cache -- confirm against ``ocrd_utils``.

    Args:
        parameter (dict): a dictionary of parameters.
        processor_class: the concrete `:py:class:~ocrd.Processor` class.
    Returns:
        `None` if ``processor_class`` is falsy. Otherwise an instance of
        ``processor_class`` created without a workspace and with a plain-dict
        copy of ``parameter`` (or `None` if ``parameter`` is empty).
    """
    if not processor_class:
        return None
    # hand the processor a fresh, ordinary dict rather than the cached argument
    plain_parameter = dict(parameter) if parameter else None
    return processor_class(workspace=None, parameter=plain_parameter)
374
375
376
def get_processor(
        processor_class,
        parameter: dict,
        workspace: Workspace = None,
        page_id: str = None,
        input_file_grp: List[str] = None,
        output_file_grp: List[str] = None,
        instance_caching: bool = False,
):
    """
    Create a processor instance, or fetch one from the instance cache.

    Without ``instance_caching``, ``processor_class`` is instantiated directly
    with all given arguments. With ``instance_caching``, the instance comes
    from ``get_cached_processor`` (looked up by ``parameter`` and
    ``processor_class``), and its ``workspace``, ``page_id``,
    ``input_file_grp`` and ``output_file_grp`` attributes are overwritten
    with the values passed here.

    Raises:
        ValueError: if ``processor_class`` is falsy.
    """
    if not processor_class:
        raise ValueError("Processor class is not known")
    if not instance_caching:
        return processor_class(
            workspace=workspace,
            page_id=page_id,
            input_file_grp=input_file_grp,
            output_file_grp=output_file_grp,
            parameter=parameter
        )
    instance = get_cached_processor(
        parameter=parameter,
        processor_class=processor_class
    )
    # the cached instance is shared, so the per-run settings are set on every call
    instance.workspace = workspace
    instance.page_id = page_id
    instance.input_file_grp = input_file_grp
    instance.output_file_grp = output_file_grp
    return instance
404