Passed
Pull Request — master (#559)
by Konstantin
01:57
created

ocrd.processor.base.Processor.resolve_resource()   C

Complexity

Conditions 11

Size

Total Lines 37
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 28
dl 0
loc 37
rs 5.4
c 0
b 0
f 0
cc 11
nop 3

How to fix   Complexity   

Complexity

Complex classes like ocrd.processor.base.Processor.resolve_resource() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""
2
Processor base class and helper functions
3
"""
4
5
__all__ = [
6
    'Processor',
7
    'generate_processor_help',
8
    'run_cli',
9
    'run_processor'
10
]
11
12
from os import makedirs
13
from os.path import exists, isdir, join
14
from shutil import copyfileobj
15
import json
16
import os
17
import re
18
from pkg_resources import resource_filename
19
20
import requests
21
22
from ocrd_utils import (
23
    VERSION as OCRD_VERSION,
24
    MIMETYPE_PAGE,
25
    list_resource_candidates,
26
    list_all_resources,
27
    XDG_CACHE_HOME
28
)
29
from ocrd_validators import ParameterValidator
30
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
31
32
# XXX imports must remain for backwards-compatibility
33
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
34
35
class Processor():
    """
    A processor runs an algorithm based on the workspace, the mets.xml in the
    workspace (and the input files defined therein) as well as optional
    parameter.
    """

    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None
    ):
        """
        Set up the processor, or print information and return early.

        The three flags are checked in this order and each one returns
        before the remaining attributes are assigned:

        - ``dump_json``: print ``ocrd_tool`` as JSON (nothing is assigned)
        - ``show_help``: print the help text (only ``ocrd_tool`` is assigned)
        - ``show_version``: print the version (``ocrd_tool`` and ``version``
          are assigned)

        Otherwise the working directory is changed to the workspace
        directory (if a workspace is given) and ``parameter`` is validated
        against ``ocrd_tool``.

        Args:
            workspace: workspace to operate on; may be falsy
            ocrd_tool (dict): the tool description of this processor
            parameter (dict): parameter values, defaults to an empty dict
            input_file_grp (string): input file group
            output_file_grp (string): output file group
            page_id: page ID restriction; ``[]`` is normalized to ``None``
            show_help (boolean): print help and return
            show_version (boolean): print version and return
            dump_json (boolean): print ``ocrd_tool`` as JSON and return
            version (string): version of the processor

        Raises:
            Exception: if the parameter validation report is not valid
        """
        if parameter is None:
            parameter = {}
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter

    def show_help(self):
        """
        Print the help text generated from the ocrd-tool description.
        """
        print(generate_processor_help(self.ocrd_tool, processor_instance=self))

    def show_version(self):
        """
        Print the version of this processor and of ocrd/core.
        """
        print("Version %s, ocrd/core %s" % (self.version, OCRD_VERSION))

    def verify(self):
        """
        Verify that the input fulfills the processor's requirements.

        This base implementation always returns ``True``.
        """
        return True

    def process(self):
        """
        Process the workspace

        This base implementation always raises.
        """
        raise Exception("Must be implemented")

    def add_metadata(self, pcgts):
        """
        Adds PAGE-XML MetadataItem describing the processing step

        The item is named after the first entry of the tool's ``steps``, its
        value is the tool's ``executable``, and it carries one label per
        entry of ``self.parameter``.

        Args:
            pcgts: PAGE-XML root object whose metadata is extended
        """
        pcgts.get_Metadata().add_MetadataItem(
                MetadataItemType(type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=self.ocrd_tool['executable'],
                    Labels=[LabelsType(
                        externalModel="ocrd-tool",
                        externalId="parameters",
                        Label=[LabelType(type_=name,
                            value=self.parameter[name])
                            for name in self.parameter.keys()])]))

    def resolve_resource(self, parameter_name, val):
        """
        Resolve a resource name with the algorithm in
        https://ocr-d.de/en/spec/ocrd_tool#file-parameters

        Lookup order:

        1. ``http:``/``https:`` values are downloaded into a per-executable
           cache directory and the cached path is returned
        2. the first existing candidate from ``list_resource_candidates``
        3. a file bundled with this package

        Args:
            parameter_name (string): name of parameter to resolve resource for
            val (string): resource value to resolve

        Returns:
            string: path of the resolved resource

        Raises:
            ValueError: if the parameter is not defined in the ocrd-tool
                description or has no (non-empty) 'mimetype' field
            FileNotFoundError: if no lookup step yields an existing file
        """
        executable = self.ocrd_tool['executable']
        try:
            param = self.ocrd_tool['parameter'][parameter_name]
        except KeyError:
            raise ValueError("Parameter '%s' not defined in ocrd-tool.json" % parameter_name)
        # .get() so that a missing field is reported as the ValueError below
        # instead of leaking a KeyError
        if not param.get('mimetype'):
            raise ValueError("Parameter '%s' is not a file parameter (has no 'mimetype' field)" %
                             parameter_name)
        if val.startswith(('http:', 'https:')):
            return self._download_resource(executable, val)
        # next() requires an iterator, hence a generator expression; the
        # default of None lets the lookup fall through to the bundled file
        ret = next((cand for cand in list_resource_candidates(executable, val)
                    if exists(cand)), None)
        if ret:
            return ret
        bundled_fpath = resource_filename(__name__, val)
        if exists(bundled_fpath):
            return bundled_fpath
        raise FileNotFoundError("Could not resolve '%s' file parameter value '%s'" %
                                (parameter_name, val))

    def _download_resource(self, executable, url):
        """
        Download ``url`` into the cache directory of ``executable`` unless a
        cached copy already exists, and return the path of the cached file.
        """
        cache_dir = join(XDG_CACHE_HOME, executable)
        cache_key = re.sub('[^A-Za-z0-9]', '', url)
        cache_fpath = join(cache_dir, cache_key)
        # TODO Proper caching (make head request for size, If-Modified etc)
        if exists(cache_fpath):
            return cache_fpath
        makedirs(cache_dir, exist_ok=True)
        # Download to a temporary name first so that an interrupted transfer
        # is never mistaken for a complete cached resource later on
        part_fpath = cache_fpath + '.part'
        try:
            with requests.get(url, stream=True) as r:
                # do not cache an HTTP error page as if it were the resource
                r.raise_for_status()
                with open(part_fpath, 'wb') as f:
                    # iter_content applies content-decoding, r.raw does not
                    for chunk in r.iter_content(chunk_size=65536):
                        f.write(chunk)
            os.replace(part_fpath, cache_fpath)
        finally:
            if exists(part_fpath):
                os.remove(part_fpath)
        return cache_fpath

    def list_all_resources(self):
        """
        List all resources found in the filesystem
        """
        return list_all_resources(self.ocrd_tool['executable'])

    @property
    def input_files(self):
        """
        List the input files.

        - If there's a PAGE-XML for the page, take it (and forget about all
          other files for that page)
        - Else if there's only one image, take it (and forget about all other
          files for that page)
        - Otherwise raise an error (complaining that only PAGE-XML warrants
          having multiple images for a single page)

        The multiple-images check is only made when ``self.page_id`` is set.

        (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)

        Raises:
            ValueError: if ``self.page_id`` is set, no PAGE-XML was found and
                more than one image matches
        """
        ret = self.workspace.mets.find_all_files(
            fileGrp=self.input_file_grp, pageId=self.page_id, mimetype=MIMETYPE_PAGE)
        if ret:
            return ret
        # NOTE(review): the '//' prefix presumably marks the value as a regex
        # for find_all_files -- confirm against its documentation
        ret = self.workspace.mets.find_all_files(
            fileGrp=self.input_file_grp, pageId=self.page_id, mimetype="//image/.*")
        if self.page_id and len(ret) > 1:
            raise ValueError("No PAGE-XML %s in fileGrp '%s' but multiple images." % (
                "for page '%s'" % self.page_id if self.page_id else '',
                self.input_file_grp
                ))
        return ret
190