Passed
Pull Request — master (#966)
by Konstantin
02:36
created

ocrd.processor.helpers.run_processor()   C

Complexity

Conditions 5

Size

Total Lines 124
Code Lines 86

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 86
dl 0
loc 124
rs 6.9915
c 0
b 0
f 0
cc 5
nop 15

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

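Commonly applied refactorings include Extract Method and, when the method uses many parameters or temporary variables, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.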

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists: Replace Parameter with Method Call, Preserve Whole Object, and Introduce Parameter Object.

1
"""
2
Helper methods for running and documenting processors
3
"""
4
from os import chdir, environ, getcwd
5
from time import perf_counter, process_time
6
from functools import lru_cache
7
import json
8
import inspect
9
from subprocess import run, PIPE
10
from typing import List, Type
11
12
from memory_profiler import memory_usage
13
from sparklines import sparklines
14
15
from click import wrap_text
16
from ocrd.workspace import Workspace
17
from ocrd_utils import freeze_args, getLogger, pushd_popd
18
19
20
__all__ = [
21
    'generate_processor_help',
22
    'run_cli',
23
    'run_processor'
24
]
25
26
27
def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None):
    """
    Return the workspace to operate on, creating one if none was given.

    If :py:attr:`workspace` is not None it is returned unchanged. Otherwise
    a workspace is created by calling ``resolver.workspace_from_url`` with
    :py:attr:`mets_url` and ``dst_dir=working_dir``.

    Raises:
        Exception: if a workspace has to be created but :py:attr:`resolver`
            or :py:attr:`mets_url` is None.
    """
    # An already-instantiated workspace always wins.
    if workspace is not None:
        return workspace
    if resolver is None:
        raise Exception("Need to pass a resolver to create a workspace")
    if mets_url is None:
        raise Exception("Need to pass mets_url to create a workspace")
    return resolver.workspace_from_url(mets_url, dst_dir=working_dir)
35
36
def run_processor(
        processorClass,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None,         # TODO actually use this!
        input_file_grp=None,
        output_file_grp=None,
        show_resource=None,
        list_resources=False,
        parameter=None,
        parameter_override=None,
        working_dir=None,
        mets_server_url=None,
        instance_caching=False  # TODO don't set this yet!
): # pylint: disable=too-many-locals
    """
    Instantiate a Pythonic processor, open a workspace, run the processor and save the workspace.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Instantiate a Python object for :py:attr:`processorClass`, passing:
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter`

    The workspace is assigned to the processor instance after construction.

    Warning: Avoid setting the `instance_caching` flag to True. It may have unexpected side effects.
    This flag is used for an experimental feature we would like to adopt in future.

    Run the processor on the workspace (creating output files in the filesystem),
    with the workspace directory as current working directory. The previous
    working directory is restored afterwards, whether or not processing succeeded.

    If the environment variable ``OCRD_PROFILE`` contains ``RSS`` or ``PSS``,
    the run is wrapped in ``memory_profiler.memory_usage`` and a memory
    consumption summary is logged to ``ocrd.process.profile``.

    Finally, record the run as an agent in the METS and write back the
    workspace (updating the METS in the filesystem).

    Note: :py:attr:`log_level`, :py:attr:`show_resource`, :py:attr:`list_resources`,
    :py:attr:`parameter_override` and :py:attr:`mets_server_url` are accepted
    but not used by this function.

    Args:
        processorClass (object): Python class of the module processor.

    Returns:
        The processor instance that was run.

    Raises:
        Exception: whatever the processor raises while processing; it is
            logged and re-raised unchanged.
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    log = getLogger('ocrd.processor.helpers.run_processor')
    log.debug("Running processor %s", processorClass)

    old_cwd = getcwd()
    processor = get_processor(
        processor_class=processorClass,
        parameter=parameter,
        workspace=None,
        page_id=page_id,
        input_file_grp=input_file_grp,
        output_file_grp=output_file_grp,
        instance_caching=instance_caching
    )
    processor.workspace = workspace
    chdir(processor.workspace.directory)

    # Everything between entering the workspace directory and leaving it is
    # guarded by a single try/finally, so that *any* failure (not only one
    # raised by the processor itself) restores the caller's working directory.
    try:
        ocrd_tool = processor.ocrd_tool
        name = '%s v%s' % (ocrd_tool['executable'], processor.version)
        otherrole = ocrd_tool['steps'][0]
        logProfile = getLogger('ocrd.process.profile')
        log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
        t0_wall = perf_counter()
        t0_cpu = process_time()
        profile_spec = environ.get('OCRD_PROFILE', '')
        profile_memory = 'RSS' in profile_spec or 'PSS' in profile_spec
        mem_usage = None
        try:
            if profile_memory:
                mem_usage = memory_usage(
                    proc=processor.process,
                    # only run process once
                    max_iterations=1,
                    interval=.1, timeout=None, timestamps=True,
                    # include sub-processes
                    multiprocess=True, include_children=True,
                    # get proportional set size instead of RSS
                    backend='psutil_pss' if 'PSS' in profile_spec else 'psutil')
            else:
                processor.process()
        except Exception:
            log.exception("Failure in processor '%s'", ocrd_tool['executable'])
            raise
    finally:
        chdir(old_cwd)

    if profile_memory:
        # with timestamps=True each sample is a (memory, timestamp) pair
        mem_usage_values = [mem for mem, _ in mem_usage]
        mem_output = 'memory consumption: '
        mem_output += ''.join(sparklines(mem_usage_values))
        mem_output += ' max: %.2f MiB min: %.2f MiB' % (max(mem_usage_values), min(mem_usage_values))
        logProfile.info(mem_output)

    t1_wall = perf_counter() - t0_wall
    t1_cpu = process_time() - t0_cpu
    logProfile.info(
        "Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']",
        ocrd_tool['executable'],
        t1_wall,
        t1_cpu,
        processor.input_file_grp or '',
        processor.output_file_grp or '',
        # json.dumps() never returns an empty string, so no fallback is needed
        json.dumps(processor.parameter),
        processor.page_id or ''
    )
    workspace.mets.add_agent(
        name=name,
        _type='OTHER',
        othertype='SOFTWARE',
        role='OTHER',
        otherrole=otherrole,
        notes=[({'option': 'input-file-grp'}, processor.input_file_grp or ''),
               ({'option': 'output-file-grp'}, processor.output_file_grp or ''),
               ({'option': 'parameter'}, json.dumps(processor.parameter or '')),
               ({'option': 'page-id'}, processor.page_id or '')]
    )
    workspace.save_mets()
    return processor
160
161
162
def run_cli(
        executable,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        overwrite=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
):
    """
    Open a workspace and run a processor on the command line.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Run the processor CLI :py:attr:`executable` on the workspace, passing:
    - the workspace directory (as ``--working-dir``)
    - :py:attr:`mets_url` (if given)
    - :py:attr:`log_level`
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter`
    - :py:attr:`overwrite`

    (Will create output files and update the METS in the filesystem).

    Args:
        executable (string): Executable name of the module processor.
        parameter (string or object): Parameters as a JSON string or JSON
            file path; any other non-empty value is serialized with
            :py:func:`json.dumps` before being passed on.

    Returns:
        int: the exit code of the subprocess (not checked here).
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    args = [executable, '--working-dir', workspace.directory]
    # When called with a ready-made workspace, mets_url may be None; a None
    # entry would break both the debug join below and the subprocess call.
    # NOTE(review): omitting --mets relies on the processor CLI's own default
    # (documented as ./mets.xml in the help text) — confirm this is intended.
    if mets_url is not None:
        args += ['--mets', mets_url]
    if log_level:
        args += ['--log-level', log_level]
    if page_id:
        args += ['--page-id', page_id]
    if input_file_grp:
        args += ['--input-file-grp', input_file_grp]
    if output_file_grp:
        args += ['--output-file-grp', output_file_grp]
    if parameter:
        # argv entries must be strings; the CLI accepts verbatim JSON
        if not isinstance(parameter, str):
            parameter = json.dumps(parameter)
        args += ['--parameter', parameter]
    if overwrite:
        args += ['--overwrite']
    log = getLogger('ocrd.processor.helpers.run_cli')
    log.debug("Running subprocess '%s'", ' '.join(args))
    result = run(args, check=False)
    return result.returncode
213
214
def generate_processor_help(ocrd_tool, processor_instance=None):
    """Generate a string describing the full CLI of this processor including params.

    The text consists of a usage line, the tool description (optionally
    followed by the docstrings of the processor's module, class and
    ``process`` method), the fixed list of CLI options, and one entry per
    parameter declared in ``ocrd_tool['parameters']``.

    Args:
         ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
             (must contain ``executable`` and ``description``)
         processor_instance (object, optional): the processor implementation
             (for adding any module/class/function docstrings)

    Returns:
        str: the complete help text.
    """
    parameters = ocrd_tool.get('parameters')
    if not parameters:
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        chunks = []
        for param_name, param in parameters.items():
            # "required" takes precedence over showing a default value
            if param.get('required'):
                qualifier = ' - REQUIRED'
            elif 'default' in param:
                qualifier = ' - %s' % json.dumps(param['default'])
            else:
                qualifier = ''
            chunks.append(wrap('"%s" [%s%s]' % (param_name, param['type'], qualifier)))
            chunks.append('\n ' + wrap(param['description']))
            if 'enum' in param:
                chunks.append('\n ' + wrap('Possible values: %s' % json.dumps(param['enum'])))
            chunks.append("\n")
        parameter_help = ''.join(chunks)
    doc_help = ''
    if processor_instance:
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
        if processor_instance.process.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__) + '\n'
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)
    # Each option is listed exactly once, with descriptions aligned in one column.
    return '''
Usage: %s [OPTIONS]

  %s%s

Options for processing:
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those)
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
                                  If URL starts with http:// start an HTTP server there,
                                  otherwise URL is a path to an on-demand-created unix socket
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]

Options for Processing Worker server:
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --type                          type of processing: either "worker" or "server"

Options for information:
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version

Parameters:
%s

''' % (
    ocrd_tool['executable'],
    ocrd_tool['description'],
    doc_help,
    parameter_help,
)
307
308
309
# Taken from https://github.com/OCR-D/core/pull/884
310
@freeze_args
# environ values are strings, but lru_cache() only accepts an int (or None)
# as maxsize, so the configured value has to be converted explicitly.
@lru_cache(maxsize=int(environ.get('OCRD_MAX_PROCESSOR_CACHE', 128)))
def get_cached_processor(parameter: dict, processor_class):
    """
    Call this function to get back an instance of a processor.
    The results are cached based on the parameters.

    The cache size is taken from the environment variable
    ``OCRD_MAX_PROCESSOR_CACHE`` at import time (default: 128).

    Args:
        parameter (dict): a dictionary of parameters.
        processor_class: the concrete :py:class:`~ocrd.Processor` class.
    Returns:
        When the concrete class of the processor is unknown, `None` is returned.
        Otherwise, an instance of the :py:class:`~ocrd.Processor` is returned,
        constructed without a workspace.
    """
    if processor_class:
        # presumably freeze_args passes a hashable stand-in for the dict so
        # that lru_cache can key on it; turn it back into a plain dict here
        dict_params = dict(parameter) if parameter else None
        return processor_class(workspace=None, parameter=dict_params)
    return None
327
328
329
def get_processor(
        processor_class,
        parameter: dict,
        workspace: Workspace = None,
        page_id: str = None,
        input_file_grp: List[str] = None,
        output_file_grp: List[str] = None,
        instance_caching: bool = False,
):
    """
    Return a processor instance configured for one run.

    Without :py:attr:`instance_caching`, a new instance of
    :py:attr:`processor_class` is constructed from the given arguments.
    With :py:attr:`instance_caching`, the instance is obtained from
    :py:func:`get_cached_processor` (keyed on class and parameters) and its
    ``workspace``, ``page_id``, ``input_file_grp`` and ``output_file_grp``
    attributes are overwritten with the given values.

    Raises:
        ValueError: if :py:attr:`processor_class` is falsy.
    """
    if not processor_class:
        raise ValueError("Processor class is not known")

    if not instance_caching:
        return processor_class(
            workspace=workspace,
            page_id=page_id,
            input_file_grp=input_file_grp,
            output_file_grp=output_file_grp,
            parameter=parameter
        )

    # Re-use a (possibly shared) instance and re-target it at this run.
    processor = get_cached_processor(
        parameter=parameter,
        processor_class=processor_class
    )
    processor.workspace = workspace
    processor.page_id = page_id
    processor.input_file_grp = input_file_grp
    processor.output_file_grp = output_file_grp
    return processor
357