Passed
Push — master ( fd842e...236931 )
by Konstantin
02:19
created

ocrd.processor.builtin.shell_processor   A

Complexity

Total Complexity 22

Size/Duplication

Total Lines 130
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 22
eloc 98
dl 0
loc 130
rs 10
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
A cli() 0 4 1

4 Methods

Rating   Name   Duplication   Size   Complexity  
A ShellProcessor.setup() 0 6 3
A ShellProcessor.metadata_filename() 0 3 1
A ShellProcessor.executable() 0 3 1
F ShellProcessor.process_page_file() 0 88 16
1
# pylint: disable=missing-module-docstring,invalid-name
2
from typing import List, Optional, get_args
3
import os
4
import subprocess
5
from tempfile import TemporaryDirectory
6
7
import click
8
9
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
10
from ocrd import Processor
11
from ocrd.processor.base import MissingInputFile
12
from ocrd_models import OcrdPage, OcrdFileType
13
from ocrd_models.ocrd_page import to_xml
14
from ocrd_modelfactory import page_from_file
15
from ocrd_utils import config, make_file_id, MIMETYPE_PAGE
16
17
18
class ShellProcessor(Processor):
    # Processor that hands the PAGE file(s) of each page to a user-supplied
    # shell command line (parameter ``command``) and registers the PAGE file
    # written by that command as the page's output.
    #
    # NOTE(review): this is deliberately a comment, not a class docstring --
    # the ``\b`` marker in ``process_page_file`` suggests docstrings are
    # rendered as CLI help text; confirm before converting.

    def setup(self):
        """Validate the ``command`` parameter.

        Raises:
            ValueError: if ``command`` lacks the ``@INFILE`` or the
                ``@OUTFILE`` placeholder.  (``ValueError`` derives from
                ``Exception``, so handlers for the latter keep working.)
        """
        command = self.parameter['command']
        for pattern in ('@INFILE', '@OUTFILE'):
            if pattern not in command:
                raise ValueError(f"command parameter requires {pattern} pattern")

    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
        """
        Process PAGE files via arbitrary command line on the shell.

        \b
        For each selected physical page of the workspace, pass ``command``
        to the shell, replacing:
        - the string ``@INFILE`` with the PAGE input file path,
        - the string ``@OUTFILE`` with the PAGE output file path.

        Modify the resulting PAGE output file with our new `@pcGtsId` and
        metadata.
        """
        # Contract (kept out of the docstring above on purpose):
        # - ``input_files`` holds one entry per input fileGrp; ``None`` marks
        #   a fileGrp without a file for this page.
        # - Returns ``None`` in all cases.  The page is skipped (with a log
        #   message) if no input path is usable, if the command exits with a
        #   non-zero status, or if its output cannot be parsed as PAGE.
        # - Raises ``FileExistsError`` if the output file ID is already in
        #   the METS and ``OCRD_EXISTING_OUTPUT`` is not ``OVERWRITE``;
        #   ``MissingInputFile`` may be raised while resolving inputs.

        # The first file actually present determines the page ID and the
        # output file ID.  ``next`` raises StopIteration if all are None.
        input_pos = next(i for i, input_file in enumerate(input_files)
                         if input_file is not None)
        primary_file = input_files[input_pos]
        page_id = primary_file.pageId
        self._base_logger.info("processing page %s", page_id)

        input_paths = self._collect_input_paths(input_files, page_id)
        if not any(input_paths):
            self._base_logger.warning(f'skipping page {page_id}')
            return

        output_file_id = make_file_id(primary_file, self.output_file_grp)
        if primary_file.fileGrp == self.output_file_grp:
            # input=output fileGrp: re-use ID exactly
            output_file_id = primary_file.ID
        output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
        if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
            # short-cut avoiding useless computation:
            raise FileExistsError(
                f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
            )

        # The command writes into a throw-away directory; the result is
        # parsed while that directory still exists and only afterwards
        # added to the workspace under its final name.
        with TemporaryDirectory(suffix=page_id) as tmpdir:
            out_path = os.path.join(tmpdir, output_file_id + ".xml")
            command = self._fill_command(self.parameter['command'], input_paths, out_path)
            # execute command pattern
            self.logger.debug("Running command: '%s'", command)
            # pylint: disable=subprocess-run-check
            proc = subprocess.run(command, shell=True,
                                  universal_newlines=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            self.logger.debug("Command for %s returned: %d", page_id, proc.returncode)
            if proc.stdout:
                self.logger.info("Command for %s stdout: %s", page_id, proc.stdout)
            if proc.stderr:
                self.logger.warning("Command for %s stderr: %s", page_id, proc.stderr)
            if proc.returncode != 0:
                self.logger.error("Command for %s failed", page_id)
                return
            try:
                pcgts = page_from_file(out_path)
                assert isinstance(pcgts, OcrdPage)
            except ValueError as err:
                # not PAGE and not an image to generate PAGE for
                self._base_logger.error(f"non-PAGE output for page {page_id}: {err}")
                return

        pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(pcgts)
        self.workspace.add_file(
            file_id=output_file_id,
            file_grp=self.output_file_grp,
            page_id=page_id,
            local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
        )

    def _collect_input_paths(self, input_files: tuple, page_id: str) -> List[str]:
        """Return one local path per entry of ``input_files``.

        The result is parallel to ``input_files``; an entry stays ``""``
        when the file is ``None``, has no ``local_filename``, or its
        ``local_filename`` does not exist on disk.

        Raises:
            MissingInputFile: if a file has no ``local_filename`` and
                ``config.OCRD_MISSING_INPUT`` is ``'ABORT'``.
        """
        # split once instead of once per loop iteration
        file_grps = self.input_file_grp.split(',')
        input_paths: List[str] = [""] * len(input_files)
        for i, input_file in enumerate(input_files):
            grp = file_grps[i]
            if input_file is None:
                self._base_logger.debug(f"ignoring missing file for input fileGrp {grp} for page {page_id}")
                continue
            assert isinstance(input_file, get_args(OcrdFileType))
            if not input_file.local_filename:
                self._base_logger.error(f'No local file exists for page {page_id} in file group {grp}')
                if config.OCRD_MISSING_INPUT == 'ABORT':
                    raise MissingInputFile(grp, page_id, input_file.mimetype)
                continue
            self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
            if os.path.exists(input_file.local_filename):
                input_paths[i] = input_file.local_filename
            else:
                self._base_logger.error(f"non-existing local file for input fileGrp {grp} for page {page_id}")
        return input_paths

    @staticmethod
    def _fill_command(command: str, input_paths: List[str], out_path: str) -> str:
        """Substitute shell-quoted paths for the placeholders in ``command``.

        The n-th ``@INFILE`` (left to right) receives the n-th entry of
        ``input_paths``; surplus ``@INFILE`` occurrences are left untouched
        and surplus paths are ignored.  Every ``@OUTFILE`` receives
        ``out_path``.
        """
        # Imported locally so this block does not depend on the module's
        # import section.
        import re
        import shlex

        # remove quotation around filename patterns, if any
        command = command.replace('"@INFILE"', '@INFILE').replace('"@OUTFILE"', '@OUTFILE')
        command = command.replace("'@INFILE'", '@INFILE').replace("'@OUTFILE'", '@OUTFILE')

        remaining_inputs = iter(input_paths)

        def _path_for(match):
            placeholder = match.group(0)
            if placeholder == '@OUTFILE':
                return shlex.quote(out_path)
            try:
                # shlex.quote keeps quotes, ``$``, backticks and backslashes
                # in a path from being interpreted by the shell
                return shlex.quote(next(remaining_inputs))
            except StopIteration:
                return placeholder

        # Single pass: text that was just substituted is never re-scanned,
        # so a path that happens to contain a placeholder stays intact.
        return re.sub('@INFILE|@OUTFILE', _path_for, command)

    @property
    def metadata_filename(self):
        """Path of the ``ocrd-tool.json`` file used for this processor."""
        return 'processor/builtin/dummy/ocrd-tool.json'

    @property
    def executable(self):
        """Name of this processor's executable."""
        return 'ocrd-command'
124
125
126
@click.command()
@ocrd_cli_options
def cli(*args, **kwargs):
    # Command-line entry point: forwards every argument it receives,
    # unchanged, to the processor wrapper together with ShellProcessor.
    # (A comment rather than a docstring, so no new help text is attached
    # to the click command.)
    processor_class = ShellProcessor
    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)
130