Passed
Pull Request — master (#966)
by Konstantin
02:30
created

ocrd.processor.helpers.run_processor()   C

Complexity

Conditions 5

Size

Total Lines 125
Code Lines 87

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 87
dl 0
loc 125
rs 6.9587
c 0
b 0
f 0
cc 5
nop 15

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

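Commonly applied refactorings include Extract Method and, when many parameters or temporary variables are involved, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.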

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists, such as Replace Parameter with Method Call, Preserve Whole Object, and Introduce Parameter Object.

1
"""
2
Helper methods for running and documenting processors
3
"""
4
from os import chdir, environ, getcwd
5
from time import perf_counter, process_time
6
from functools import lru_cache
7
import json
8
import inspect
9
from subprocess import run, PIPE
10
from typing import List, Type
11
12
from memory_profiler import memory_usage
13
from sparklines import sparklines
14
15
from click import wrap_text
16
from ocrd.workspace import Workspace
17
from ocrd_utils import freeze_args, getLogger, pushd_popd
18
19
20
# Public API of this module (names exported by ``from ... import *``).
__all__ = [
    'generate_processor_help',
    'run_cli',
    'run_processor'
]
25
26
27
def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None, mets_server_url=None):
    """
    Return the workspace to operate on, creating one if none was passed.

    If :py:attr:`workspace` is not None, it is returned unchanged. Otherwise
    a workspace is created by calling ``resolver.workspace_from_url`` with
    :py:attr:`mets_url`, :py:attr:`working_dir` (as ``dst_dir``) and
    :py:attr:`mets_server_url`.

    Raises:
        Exception: if a workspace has to be created but :py:attr:`resolver`
            or :py:attr:`mets_url` is None.
    """
    if workspace is not None:
        return workspace
    if resolver is None:
        raise Exception("Need to pass a resolver to create a workspace")
    if mets_url is None:
        raise Exception("Need to pass mets_url to create a workspace")
    return resolver.workspace_from_url(mets_url, dst_dir=working_dir, mets_server_url=mets_server_url)
35
36
def run_processor(
        processorClass,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None,         # TODO actually use this!
        input_file_grp=None,
        output_file_grp=None,
        show_resource=None,
        list_resources=False,
        parameter=None,
        parameter_override=None,
        working_dir=None,
        mets_server_url=None,
        instance_caching=False  # TODO don't set this yet!
): # pylint: disable=too-many-locals
    """
    Instantiate a Pythonic processor, open a workspace, run the processor and save the workspace.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Instantiate a Python object for :py:attr:`processorClass`, passing:
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter`

    The workspace is assigned to the processor after instantiation.

    Warning: Avoid setting the `instance_caching` flag to True. It may have unexpected side effects.
    This flag is used for an experimental feature we would like to adopt in future.

    Run the processor on the workspace (creating output files in the filesystem),
    with the current working directory temporarily changed to the workspace
    directory. If the environment variable ``OCRD_PROFILE`` contains ``RSS`` or
    ``PSS``, the run is wrapped in ``memory_usage`` and a memory consumption
    summary is logged to the ``ocrd.process.profile`` logger.

    Finally, add an agent entry to the METS and write back the workspace
    (updating the METS in the filesystem).

    Args:
        processorClass (object): Python class of the module processor.
        log_level, show_resource, list_resources, parameter_override:
            accepted for interface compatibility, but not used by this function.

    Returns:
        The processor instance that was run.

    Raises:
        Exception: any exception raised while processing is logged and re-raised.
    """
    workspace = _get_workspace(
        workspace,
        resolver,
        mets_url,
        working_dir,
        mets_server_url
    )
    log = getLogger('ocrd.processor.helpers.run_processor')
    log.debug("Running processor %s", processorClass)

    old_cwd = getcwd()
    processor = get_processor(
        processor_class=processorClass,
        parameter=parameter,
        workspace=None,
        page_id=page_id,
        input_file_grp=input_file_grp,
        output_file_grp=output_file_grp,
        instance_caching=instance_caching
    )
    processor.workspace = workspace
    chdir(processor.workspace.directory)

    mem_usage = None
    # Everything between changing into the workspace directory and the end of
    # processing is guarded, so the previous working directory is restored even
    # if reading the processor metadata fails.
    try:
        ocrd_tool = processor.ocrd_tool
        name = '%s v%s' % (ocrd_tool['executable'], processor.version)
        otherrole = ocrd_tool['steps'][0]
        logProfile = getLogger('ocrd.process.profile')
        log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
        profile = environ.get('OCRD_PROFILE', '')
        t0_wall = perf_counter()
        t0_cpu = process_time()
        try:
            if 'RSS' in profile or 'PSS' in profile:
                mem_usage = memory_usage(proc=processor.process,
                                         # only run process once
                                         max_iterations=1,
                                         interval=.1, timeout=None, timestamps=True,
                                         # include sub-processes
                                         multiprocess=True, include_children=True,
                                         # get proportional set size instead of RSS
                                         backend='psutil_pss' if 'PSS' in profile else 'psutil')
            else:
                processor.process()
        except Exception:
            log.exception("Failure in processor '%s'", ocrd_tool['executable'])
            raise
    finally:
        chdir(old_cwd)

    if mem_usage is not None:
        # with timestamps=True each sample is a pair; only the first item is plotted
        mem_usage_values = [mem for mem, _ in mem_usage]
        mem_output = 'memory consumption: '
        mem_output += ''.join(sparklines(mem_usage_values))
        mem_output += ' max: %.2f MiB min: %.2f MiB' % (max(mem_usage_values), min(mem_usage_values))
        logProfile.info(mem_output)

    t1_wall = perf_counter() - t0_wall
    t1_cpu = process_time() - t0_cpu
    logProfile.info(
        "Executing processor '%s' took %fs (wall) %fs (CPU) "
        "[--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']",
        ocrd_tool['executable'],
        t1_wall,
        t1_cpu,
        processor.input_file_grp or '',
        processor.output_file_grp or '',
        json.dumps(processor.parameter),
        processor.page_id or ''
    )
    workspace.mets.add_agent(
        name=name,
        _type='OTHER',
        othertype='SOFTWARE',
        role='OTHER',
        otherrole=otherrole,
        notes=[({'option': 'input-file-grp'}, processor.input_file_grp or ''),
               ({'option': 'output-file-grp'}, processor.output_file_grp or ''),
               ({'option': 'parameter'}, json.dumps(processor.parameter or '')),
               ({'option': 'page-id'}, processor.page_id or '')]
    )
    workspace.save_mets()
    return processor
161
162
163
def run_cli(
        executable,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        overwrite=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
):
    """
    Open a workspace and run a processor on the command line.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Run the processor CLI :py:attr:`executable` with ``--working-dir`` set to
    the workspace directory, passing (each only if set):
    - :py:attr:`mets_url`
    - :py:attr:`log_level`
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter`
    - :py:attr:`overwrite`

    (Will create output files and update the METS in the filesystem).

    Args:
        executable (string): Executable name of the module processor.

    Returns:
        int: the return code of the subprocess (a non-zero code does not raise).
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    args = [executable, '--working-dir', workspace.directory]
    # mets_url may be None when an existing workspace was passed in; a None
    # entry would make both the debug join and the subprocess call fail, so
    # the option is left out and the executable's own default applies.
    if mets_url:
        args += ['--mets', mets_url]
    if log_level:
        args += ['--log-level', log_level]
    if page_id:
        args += ['--page-id', page_id]
    if input_file_grp:
        args += ['--input-file-grp', input_file_grp]
    if output_file_grp:
        args += ['--output-file-grp', output_file_grp]
    if parameter:
        args += ['--parameter', parameter]
    if overwrite:
        args += ['--overwrite']
    log = getLogger('ocrd.processor.helpers.run_cli')
    log.debug("Running subprocess '%s'", ' '.join(args))
    result = run(args, check=False)
    return result.returncode
214
215
def generate_processor_help(ocrd_tool, processor_instance=None):
    """Generate a string describing the full CLI of this processor including params.

    Args:
         ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
         processor_instance (object, optional): the processor implementation
             (for adding any module/class/function docstrings)

    Returns:
        str: the help text, consisting of the usage line, the tool description
        (plus any docstrings found via ``processor_instance``), the fixed
        option listing and one entry per parameter (or ``NONE``).
    """
    parameter_help = ''
    if not ocrd_tool.get('parameters'):
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        for param_name, param in ocrd_tool['parameters'].items():
            # a required parameter is flagged as such; otherwise show the default, if any
            if param.get('required'):
                suffix = ' - REQUIRED'
            elif 'default' in param:
                suffix = ' - %s' % json.dumps(param['default'])
            else:
                suffix = ''
            parameter_help += wrap('"%s" [%s%s]' % (param_name, param['type'], suffix))
            parameter_help += '\n ' + wrap(param['description'])
            if 'enum' in param:
                parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
            parameter_help += "\n"
    doc_help = ''
    if processor_instance:
        # collect docstrings from the module, the instance and its process method
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
        if processor_instance.process.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__) + '\n'
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)
    return '''
Usage: %s [OPTIONS]

  %s%s

Options for processing:
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those)
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
                                  If URL starts with http:// start an HTTP server there,
                                  otherwise URL is a path to an on-demand-created unix socket
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]

Options for Processing Worker server:
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --type                          type of processing: either "worker" or "server"

Options for information:
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version

Parameters:
%s

''' % (
    ocrd_tool['executable'],
    ocrd_tool['description'],
    doc_help,
    parameter_help,
)
308
309
310
# Taken from https://github.com/OCR-D/core/pull/884
311
# NOTE(review): freeze_args presumably makes the dict argument hashable so that
# lru_cache can use it as a key - confirm against ocrd_utils.
@freeze_args
# environment values are strings, but lru_cache only accepts an int (or None)
# as maxsize, so the value is converted explicitly
@lru_cache(maxsize=int(environ.get('OCRD_MAX_PROCESSOR_CACHE', 128)))
def get_cached_processor(parameter: dict, processor_class):
    """
    Call this function to get back an instance of a processor.
    The results are cached based on the parameters.
    The cache size is taken from the environment variable
    ``OCRD_MAX_PROCESSOR_CACHE`` (default 128) when this module is imported.
    Args:
        parameter (dict): a dictionary of parameters.
        processor_class: the concrete `:py:class:~ocrd.Processor` class.
    Returns:
        When the concrete class of the processor is unknown, `None` is returned.
        Otherwise, an instance of the `:py:class:~ocrd.Processor` is returned.
    """
    if not processor_class:
        return None
    # hand the processor a plain dict copy rather than the cached argument itself
    dict_params = dict(parameter) if parameter else None
    return processor_class(workspace=None, parameter=dict_params)
328
329
330
def get_processor(
        processor_class,
        parameter: dict,
        workspace: Workspace = None,
        page_id: str = None,
        input_file_grp: List[str] = None,
        output_file_grp: List[str] = None,
        instance_caching: bool = False,
):
    """
    Return a processor instance of :py:attr:`processor_class`.

    Without :py:attr:`instance_caching`, a new instance is constructed from
    all of the given arguments. With :py:attr:`instance_caching`, the instance
    is obtained from :py:func:`get_cached_processor` (keyed on
    :py:attr:`parameter` and :py:attr:`processor_class`) and its ``workspace``,
    ``page_id``, ``input_file_grp`` and ``output_file_grp`` attributes are
    overwritten with the given values before it is returned.

    Raises:
        ValueError: if :py:attr:`processor_class` is falsy.
    """
    if not processor_class:
        raise ValueError("Processor class is not known")
    if not instance_caching:
        return processor_class(
            workspace=workspace,
            page_id=page_id,
            input_file_grp=input_file_grp,
            output_file_grp=output_file_grp,
            parameter=parameter
        )
    cached_processor = get_cached_processor(
        parameter=parameter,
        processor_class=processor_class
    )
    # the cached instance was built without per-run state, so set it here
    cached_processor.workspace = workspace
    cached_processor.page_id = page_id
    cached_processor.input_file_grp = input_file_grp
    cached_processor.output_file_grp = output_file_grp
    return cached_processor
358