Passed
Pull Request — master (#966)
by Konstantin
02:35
created

ocrd.processor.helpers.run_processor()   C

Complexity

Conditions 5

Size

Total Lines 126
Code Lines 88

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 88
dl 0
loc 126
rs 6.926
c 0
b 0
f 0
cc 5
nop 17

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

- Extract Method: move a cohesive part of the body into its own, well-named method.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

- Replace Parameter with Method Call
- Introduce Parameter Object
- Preserve Whole Object

1
"""
2
Helper methods for running and documenting processors
3
"""
4
from os import chdir, environ, getcwd
5
from time import perf_counter, process_time
6
from functools import lru_cache
7
import json
8
import inspect
9
from subprocess import run, PIPE
10
from typing import List, Type
11
12
from memory_profiler import memory_usage
13
from sparklines import sparklines
14
15
from click import wrap_text
16
from ocrd.workspace import Workspace
17
from ocrd_utils import freeze_args, getLogger, pushd_popd
18
19
20
__all__ = [
21
    'generate_processor_help',
22
    'run_cli',
23
    'run_processor'
24
]
25
26
27
def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None):
28
    if workspace is None:
29
        if resolver is None:
30
            raise Exception("Need to pass a resolver to create a workspace")
31
        if mets_url is None:
32
            raise Exception("Need to pass mets_url to create a workspace")
33
        workspace = resolver.workspace_from_url(mets_url, dst_dir=working_dir)
34
    return workspace
35
36
def run_processor(
        processorClass,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None,         # TODO actually use this!
        input_file_grp=None,
        output_file_grp=None,
        show_resource=None,
        list_resources=False,
        parameter=None,
        parameter_override=None,
        working_dir=None,
        mets_server_host=None,
        mets_server_port=None,
        mets_server_socket=None,
        instance_caching=False  # TODO don't set this yet!
): # pylint: disable=too-many-locals
    """
    Instantiate a Pythonic processor, open a workspace, run the processor and save the workspace.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Instantiate a Python object for :py:attr:`processorClass`, passing:
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter`

    The workspace is assigned to the processor instance after construction.

    Warning: Avoid setting the `instance_caching` flag to True. It may have unexpected side effects.
    This flag is used for an experimental feature we would like to adopt in future.

    Run the processor on the workspace (creating output files in the filesystem).
    While the processor runs, the current working directory is the workspace
    directory; the previous working directory is restored afterwards, also on failure.

    If the environment variable ``OCRD_PROFILE`` contains ``RSS`` or ``PSS``,
    the run is wrapped in ``memory_profiler.memory_usage`` and a memory
    summary is logged to the ``ocrd.process.profile`` logger.

    Finally, record the processor as an agent in the METS and write back the
    workspace (updating the METS in the filesystem).

    Args:
        processorClass (object): Python class of the module processor.
        mets_url (string): METS location, used only when no workspace is passed.
        resolver (object): resolver used to create a workspace when none is passed.
        workspace (object): existing workspace to reuse.
        page_id (string): passed on to the processor.
        log_level: accepted but not used by this function yet.
        input_file_grp (string): passed on to the processor.
        output_file_grp (string): passed on to the processor.
        show_resource: accepted but not used by this function.
        list_resources: accepted but not used by this function.
        parameter (dict): passed on to the processor.
        parameter_override: accepted but not applied by this function.
        working_dir (string): target directory when a workspace has to be created.
        mets_server_host: accepted but not used by this function.
        mets_server_port: accepted but not used by this function.
        mets_server_socket: accepted but not used by this function.
        instance_caching (bool): reuse a cached processor instance (experimental).

    Returns:
        The processor instance that was run.

    Raises:
        Exception: whatever the processor raises while processing (logged, then
            re-raised), or if a workspace can neither be reused nor created.
    """
    workspace = _get_workspace(
        workspace,
        resolver,
        mets_url,
        working_dir
    )
    log = getLogger('ocrd.processor.helpers.run_processor')
    log.debug("Running processor %s", processorClass)

    old_cwd = getcwd()
    processor = get_processor(
        processor_class=processorClass,
        parameter=parameter,
        workspace=None,
        page_id=page_id,
        input_file_grp=input_file_grp,
        output_file_grp=output_file_grp,
        instance_caching=instance_caching
    )
    processor.workspace = workspace
    chdir(processor.workspace.directory)

    # Everything between changing into the workspace and finishing the run is
    # guarded, so that the caller's working directory is restored even if the
    # tool description is malformed or the processor fails.
    mem_usage = None
    try:
        ocrd_tool = processor.ocrd_tool
        name = '%s v%s' % (ocrd_tool['executable'], processor.version)
        otherrole = ocrd_tool['steps'][0]
        logProfile = getLogger('ocrd.process.profile')
        log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
        profile_spec = environ.get('OCRD_PROFILE', '')
        profile_memory = 'RSS' in profile_spec or 'PSS' in profile_spec
        t0_wall = perf_counter()
        t0_cpu = process_time()
        try:
            if profile_memory:
                backend = 'psutil_pss' if 'PSS' in profile_spec else 'psutil'
                mem_usage = memory_usage(proc=processor.process,
                                         # only run process once
                                         max_iterations=1,
                                         interval=.1, timeout=None, timestamps=True,
                                         # include sub-processes
                                         multiprocess=True, include_children=True,
                                         # get proportional set size instead of RSS
                                         backend=backend)
            else:
                processor.process()
        except Exception:
            log.exception("Failure in processor '%s'", ocrd_tool['executable'])
            raise
    finally:
        chdir(old_cwd)

    if mem_usage is not None:
        # with timestamps=True each sample is a (memory, timestamp) pair
        mem_usage_values = [mem for mem, _ in mem_usage]
        mem_output = 'memory consumption: '
        mem_output += ''.join(sparklines(mem_usage_values))
        mem_output += ' max: %.2f MiB min: %.2f MiB' % (max(mem_usage_values), min(mem_usage_values))
        logProfile.info(mem_output)

    t1_wall = perf_counter() - t0_wall
    t1_cpu = process_time() - t0_cpu
    logProfile.info(
        "Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']",
        ocrd_tool['executable'],
        t1_wall,
        t1_cpu,
        processor.input_file_grp or '',
        processor.output_file_grp or '',
        json.dumps(processor.parameter),
        processor.page_id or ''
    )
    workspace.mets.add_agent(
        name=name,
        _type='OTHER',
        othertype='SOFTWARE',
        role='OTHER',
        otherrole=otherrole,
        notes=[({'option': 'input-file-grp'}, processor.input_file_grp or ''),
               ({'option': 'output-file-grp'}, processor.output_file_grp or ''),
               ({'option': 'parameter'}, json.dumps(processor.parameter or '')),
               ({'option': 'page-id'}, processor.page_id or '')]
    )
    workspace.save_mets()
    return processor
162
163
164
def run_cli(
        executable,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        overwrite=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
):
    """
    Open a workspace and run a processor on the command line.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Run the processor CLI :py:attr:`executable` on the workspace, passing:
    - the workspace directory (as ``--working-dir``),
    - :py:attr:`mets_url` (omitted when it is ``None``, leaving the choice of
      METS file to the processor's own default)
    - :py:attr:`log_level`
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter` (a string is passed verbatim, anything else is
      serialized to JSON first)
    - ``--overwrite`` if :py:attr:`overwrite` is true

    (Will create output files and update the METS in the filesystem).

    Args:
        executable (string): Executable name of the module processor.

    Returns:
        int: the exit code of the subprocess (a non-zero code does not raise).
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    args = [executable, '--working-dir', workspace.directory]
    # A reused workspace may come without mets_url; a None entry would break
    # both the debug message below and the subprocess call.
    if mets_url is not None:
        args += ['--mets', mets_url]
    if log_level:
        args += ['--log-level', log_level]
    if page_id:
        args += ['--page-id', page_id]
    if input_file_grp:
        args += ['--input-file-grp', input_file_grp]
    if output_file_grp:
        args += ['--output-file-grp', output_file_grp]
    if parameter:
        # command line arguments must be strings
        if not isinstance(parameter, str):
            parameter = json.dumps(parameter)
        args += ['--parameter', parameter]
    if overwrite:
        args += ['--overwrite']
    log = getLogger('ocrd.processor.helpers.run_cli')
    log.debug("Running subprocess '%s'", ' '.join(args))
    result = run(args, check=False)
    return result.returncode
215
216
def generate_processor_help(ocrd_tool, processor_instance=None):
    """Generate a string describing the full CLI of this processor including params.

    The text consists of a usage line, the tool description (followed by any
    module/class/``process`` docstrings of :py:attr:`processor_instance`), the
    fixed lists of CLI options, and one entry per parameter declared in
    :py:attr:`ocrd_tool` (or ``NONE`` if there are none).

    Args:
         ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
         processor_instance (object, optional): the processor implementation
             (for adding any module/class/function docstrings)

    Returns:
        str: the complete help text.
    """
    parameters = ocrd_tool.get('parameters')
    if not parameters:
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        entries = []
        for param_name, param in parameters.items():
            # A required parameter is flagged as such; otherwise its default
            # (if any) is shown as JSON.
            if param.get('required'):
                suffix = ' - REQUIRED'
            elif 'default' in param:
                suffix = ' - %s' % json.dumps(param['default'])
            else:
                suffix = ''
            lines = [
                wrap('"%s" [%s%s]' % (param_name, param['type'], suffix)),
                ' ' + wrap(param['description']),
            ]
            if 'enum' in param:
                lines.append(' ' + wrap('Possible values: %s' % json.dumps(param['enum'])))
            entries.append('\n'.join(lines) + '\n')
        parameter_help = ''.join(entries)
    doc_help = ''
    if processor_instance:
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
        if processor_instance.process.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__) + '\n'
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)
    # NOTE: each option is listed exactly once; --mets and --working-dir appear
    # with their defaults at the top of the processing options.
    return '''
Usage: %s [OPTIONS]

  %s%s

Options for processing:
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those)
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  --mets-server-url URL           URL of a METS Server for parallel incremental access to METS
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]

Options for Processing Worker server:
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --type                          type of processing: either "worker" or "server"

Options for information:
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version

Parameters:
%s

''' % (
    ocrd_tool['executable'],
    ocrd_tool['description'],
    doc_help,
    parameter_help,
)
307
308
309
# Taken from https://github.com/OCR-D/core/pull/884
310
@freeze_args
# environment values are strings, but lru_cache only accepts an int (or None)
@lru_cache(maxsize=int(environ.get('OCRD_MAX_PROCESSOR_CACHE', 128)))
def get_cached_processor(parameter: dict, processor_class):
    """
    Call this function to get back an instance of a processor.
    The results are cached based on the arguments (parameters and class); the
    cache size is read from the environment variable
    ``OCRD_MAX_PROCESSOR_CACHE`` (default 128) when this module is imported.

    Args:
        parameter (dict): a dictionary of parameters.
        processor_class: the concrete :py:class:`~ocrd.Processor` class.
    Returns:
        When the concrete class of the processor is unknown, `None` is returned.
        Otherwise, an instance of the :py:class:`~ocrd.Processor` is returned,
        constructed without a workspace.
    """
    if not processor_class:
        return None
    # NOTE(review): presumably freeze_args hands over a hashable mapping so
    # that lru_cache can key on it; convert back to a plain dict - confirm.
    dict_params = dict(parameter) if parameter else None
    return processor_class(workspace=None, parameter=dict_params)
327
328
329
def get_processor(
        processor_class,
        parameter: dict,
        workspace: Workspace = None,
        page_id: str = None,
        input_file_grp: List[str] = None,
        output_file_grp: List[str] = None,
        instance_caching: bool = False,
):
    """
    Return a processor instance of :py:attr:`processor_class`.

    Without :py:attr:`instance_caching`, a new instance is constructed from
    the given arguments. With it, the instance is obtained from
    :py:func:`get_cached_processor` (keyed on parameters and class) and its
    ``workspace``, ``page_id``, ``input_file_grp`` and ``output_file_grp``
    attributes are overwritten with the given values before it is returned.

    Raises:
        ValueError: if :py:attr:`processor_class` is falsy.
    """
    if not processor_class:
        raise ValueError("Processor class is not known")

    if not instance_caching:
        return processor_class(
            workspace=workspace,
            page_id=page_id,
            input_file_grp=input_file_grp,
            output_file_grp=output_file_grp,
            parameter=parameter
        )

    # The cached instance is shared between calls, so the per-run settings
    # have to be (re)assigned every time.
    shared_instance = get_cached_processor(
        parameter=parameter,
        processor_class=processor_class
    )
    shared_instance.workspace = workspace
    shared_instance.page_id = page_id
    shared_instance.input_file_grp = input_file_grp
    shared_instance.output_file_grp = output_file_grp
    return shared_instance
357